diff --git a/checkpoint-12208/config.json b/checkpoint-12208/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-12208/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-12208/generation_config.json b/checkpoint-12208/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-12208/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-12208/model-00001-of-00007.safetensors b/checkpoint-12208/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7cf6c1a458b78841238e453554b8340a65dddbb8 --- /dev/null +++ b/checkpoint-12208/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a71a91c640f43f365d32e726753fceb11aa5cfcea684cd500a2d11b1bc2425b +size 4886466168 diff --git a/checkpoint-12208/model-00002-of-00007.safetensors b/checkpoint-12208/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-12208/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-12208/model-00003-of-00007.safetensors b/checkpoint-12208/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-12208/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-12208/model-00004-of-00007.safetensors b/checkpoint-12208/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-12208/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-12208/model-00005-of-00007.safetensors b/checkpoint-12208/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-12208/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-12208/model-00006-of-00007.safetensors b/checkpoint-12208/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cce8251e8010ddcd804df7d4501e74541237b64b --- /dev/null +++ b/checkpoint-12208/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b5e163a9dc10542a1ab970cbb083b1b043ace6eac18f0c0c2634cc2db53fe3 +size 4999813120 diff --git a/checkpoint-12208/model-00007-of-00007.safetensors b/checkpoint-12208/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b1bd09f99c1b224d1032319b47399bfc67bf9ca1 --- /dev/null +++ b/checkpoint-12208/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bca3455f64a8290bf8ba6646383874b92639d835b04b99508f6b80b3c08a5d87 +size 2571158184 diff --git a/checkpoint-12208/model.safetensors.index.json b/checkpoint-12208/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-12208/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-12208/optimizer.pt b/checkpoint-12208/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..61e7eccb5755cefe00b43b301ddf2143664d083a --- /dev/null +++ b/checkpoint-12208/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14043bfd4351f738ad6ae147081b8b3d9d4f1cff763c551607c3bf2d96dd1c35 +size 15385036334 diff --git a/checkpoint-12208/rng_state.pth b/checkpoint-12208/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-12208/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-12208/scheduler.pt b/checkpoint-12208/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed3b066b7a770b71a74f026fa108a814ac17f832 --- /dev/null +++ b/checkpoint-12208/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abe644ed33a3c4139223f0857a985127f3e6fbaa8c89fa14b57671b49ca52c21 +size 1064 diff --git a/checkpoint-12208/trainer_state.json b/checkpoint-12208/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..42be4548b6411b8a771406288bcb5b05d44964fb --- /dev/null +++ b/checkpoint-12208/trainer_state.json @@ -0,0 +1,2784 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.17927565761697598, + "eval_steps": 500, + "global_step": 12208, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + }, + { + "epoch": 0.13475044514198653, + "grad_norm": 1.226491928100586, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.7349, + "step": 9176 + }, + { + "epoch": 0.13520568313233108, + "grad_norm": 1.3030654191970825, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.7371, + "step": 9207 + }, + { + "epoch": 0.13566092112267564, + "grad_norm": 1.17802894115448, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.7431, + "step": 9238 + }, + { + "epoch": 0.13611615911302016, + "grad_norm": 1.2893658876419067, + "learning_rate": 4.170395751189495e-05, + "loss": 0.7258, + "step": 9269 + }, + { + "epoch": 0.13657139710336472, + "grad_norm": 1.193758487701416, + "learning_rate": 4.164137885110921e-05, + "loss": 0.7362, + "step": 9300 + }, + { + "epoch": 0.13702663509370927, + "grad_norm": 1.2750978469848633, + "learning_rate": 4.157861239462495e-05, + "loss": 0.7265, + "step": 9331 + }, + { + "epoch": 0.13748187308405382, + "grad_norm": 1.161464810371399, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.7389, + "step": 9362 + }, + { + "epoch": 0.13793711107439838, + "grad_norm": 1.2749390602111816, + "learning_rate": 4.145251892991588e-05, + "loss": 0.7315, + "step": 9393 + }, + { + "epoch": 0.13839234906474293, + "grad_norm": 1.1729388236999512, + "learning_rate": 4.138919334463868e-05, + "loss": 0.7352, + "step": 9424 + }, + { + "epoch": 0.13884758705508746, + "grad_norm": 1.0987122058868408, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.7347, + "step": 9455 + }, + { + "epoch": 0.139302825045432, + "grad_norm": 1.0855766534805298, + "learning_rate": 4.126198804133398e-05, + "loss": 0.7271, + "step": 9486 + }, + { + "epoch": 0.13975806303577656, + "grad_norm": 1.1782172918319702, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.74, + "step": 9517 + }, + { + "epoch": 0.14021330102612112, + "grad_norm": 1.186962366104126, + "learning_rate": 4.113404868280107e-05, + "loss": 0.725, + "step": 9548 + }, + { + "epoch": 0.14066853901646567, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.106980553625457e-05, + "loss": 0.7346, + "step": 9579 + }, + { + "epoch": 0.14112377700681022, + "grad_norm": 1.154897928237915, + "learning_rate": 4.100538104413674e-05, + "loss": 0.7348, + "step": 9610 + }, + { + "epoch": 0.14157901499715475, + "grad_norm": 1.203722357749939, + "learning_rate": 4.09407759334692e-05, + "loss": 0.7312, + "step": 9641 + }, + { + "epoch": 0.1420342529874993, + "grad_norm": 1.102858543395996, + "learning_rate": 4.087599093331186e-05, + "loss": 0.7227, + "step": 9672 + }, + { + "epoch": 0.14248949097784386, + "grad_norm": 1.2667406797409058, + "learning_rate": 4.081102677475462e-05, + "loss": 0.727, + "step": 9703 + }, + { + "epoch": 0.1429447289681884, + "grad_norm": 1.221291422843933, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.7317, + "step": 9734 + }, + { + "epoch": 0.14339996695853297, + "grad_norm": 1.0426162481307983, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.7299, + "step": 9765 + }, + { + "epoch": 0.14385520494887752, + "grad_norm": 1.2158912420272827, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.7399, + "step": 9796 + }, + { + "epoch": 0.14431044293922204, + "grad_norm": 1.2689307928085327, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 0.1447656809295666, + "grad_norm": 1.3168833255767822, + "learning_rate": 4.048354433517794e-05, + "loss": 0.7258, + "step": 9858 + }, + { + "epoch": 0.14522091891991115, + "grad_norm": 1.1966122388839722, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.7375, + "step": 9889 + }, + { + "epoch": 0.1456761569102557, + "grad_norm": 1.1252835988998413, + "learning_rate": 4.035132306369438e-05, + "loss": 0.7254, + "step": 9920 + }, + { + "epoch": 0.14613139490060026, + "grad_norm": 1.186324119567871, + "learning_rate": 4.028495219804555e-05, + "loss": 0.7337, + "step": 9951 + }, + { + "epoch": 0.1465866328909448, + "grad_norm": 1.1606496572494507, + "learning_rate": 4.021840884378864e-05, + "loss": 0.7273, + "step": 9982 + }, + { + "epoch": 0.14704187088128934, + "grad_norm": 1.316298007965088, + "learning_rate": 4.015169375185633e-05, + "loss": 0.7276, + "step": 10013 + }, + { + "epoch": 0.1474971088716339, + "grad_norm": 1.224236249923706, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.7328, + "step": 10044 + }, + { + "epoch": 0.14795234686197845, + "grad_norm": 1.227148175239563, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.739, + "step": 10075 + }, + { + "epoch": 0.148407584852323, + "grad_norm": 1.1307867765426636, + "learning_rate": 3.995052558835377e-05, + "loss": 0.7324, + "step": 10106 + }, + { + "epoch": 0.14886282284266755, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.988313109368017e-05, + "loss": 0.7263, + "step": 10137 + }, + { + "epoch": 0.1493180608330121, + "grad_norm": 1.1724634170532227, + "learning_rate": 3.981556864489504e-05, + "loss": 0.7289, + "step": 10168 + }, + { + "epoch": 0.14977329882335666, + "grad_norm": 1.140684723854065, + "learning_rate": 3.974783900443142e-05, + "loss": 0.7309, + "step": 10199 + }, + { + "epoch": 0.15022853681370119, + "grad_norm": 1.167183756828308, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.729, + "step": 10230 + }, + { + "epoch": 0.15068377480404574, + "grad_norm": 1.2216546535491943, + "learning_rate": 3.961188120762596e-05, + "loss": 0.7288, + "step": 10261 + }, + { + "epoch": 0.1511390127943903, + "grad_norm": 1.227397084236145, + "learning_rate": 3.954365458554938e-05, + "loss": 0.7313, + "step": 10292 + }, + { + "epoch": 0.15159425078473485, + "grad_norm": 1.075441837310791, + "learning_rate": 3.947526384030751e-05, + "loss": 0.7306, + "step": 10323 + }, + { + "epoch": 0.1520494887750794, + "grad_norm": 1.1227167844772339, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.7287, + "step": 10354 + }, + { + "epoch": 0.15250472676542395, + "grad_norm": 1.0665740966796875, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.7246, + "step": 10385 + }, + { + "epoch": 0.15295996475576848, + "grad_norm": 1.250580072402954, + "learning_rate": 3.926911459260109e-05, + "loss": 0.7183, + "step": 10416 + }, + { + "epoch": 0.15341520274611303, + "grad_norm": 1.1809351444244385, + "learning_rate": 3.920007509089102e-05, + "loss": 0.7307, + "step": 10447 + }, + { + "epoch": 0.1538704407364576, + "grad_norm": 1.3934037685394287, + "learning_rate": 3.913087534326357e-05, + "loss": 0.7151, + "step": 10478 + }, + { + "epoch": 0.15432567872680214, + "grad_norm": 1.1272441148757935, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.7309, + "step": 10509 + }, + { + "epoch": 0.1547809167171467, + "grad_norm": 1.2410191297531128, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.7375, + "step": 10540 + }, + { + "epoch": 0.15523615470749125, + "grad_norm": 1.1426646709442139, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.7342, + "step": 10571 + }, + { + "epoch": 0.15569139269783577, + "grad_norm": 1.2037526369094849, + "learning_rate": 3.885248953871491e-05, + "loss": 0.7329, + "step": 10602 + }, + { + "epoch": 0.15614663068818033, + "grad_norm": 1.2198660373687744, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.7367, + "step": 10633 + }, + { + "epoch": 0.15660186867852488, + "grad_norm": 1.3129019737243652, + "learning_rate": 3.871235554965218e-05, + "loss": 0.7236, + "step": 10664 + }, + { + "epoch": 0.15705710666886943, + "grad_norm": 1.2182697057724, + "learning_rate": 3.864205604623078e-05, + "loss": 0.721, + "step": 10695 + }, + { + "epoch": 0.157512344659214, + "grad_norm": 1.1494821310043335, + "learning_rate": 3.857160259406107e-05, + "loss": 0.7223, + "step": 10726 + }, + { + "epoch": 0.15796758264955854, + "grad_norm": 1.0680466890335083, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.7224, + "step": 10757 + }, + { + "epoch": 0.15842282063990307, + "grad_norm": 1.155052661895752, + "learning_rate": 3.843023702543556e-05, + "loss": 0.7268, + "step": 10788 + }, + { + "epoch": 0.15887805863024762, + "grad_norm": 1.1415716409683228, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.7167, + "step": 10819 + }, + { + "epoch": 0.15933329662059217, + "grad_norm": 1.1486759185791016, + "learning_rate": 3.828826522492255e-05, + "loss": 0.7236, + "step": 10850 + }, + { + "epoch": 0.15978853461093673, + "grad_norm": 1.1916829347610474, + "learning_rate": 3.821705398930713e-05, + "loss": 0.7269, + "step": 10881 + }, + { + "epoch": 0.16024377260128128, + "grad_norm": 1.180760383605957, + "learning_rate": 3.814569360103385e-05, + "loss": 0.724, + "step": 10912 + }, + { + "epoch": 0.16069901059162583, + "grad_norm": 1.197651982307434, + "learning_rate": 3.807418486539499e-05, + "loss": 0.7265, + "step": 10943 + }, + { + "epoch": 0.16115424858197036, + "grad_norm": 1.1579455137252808, + "learning_rate": 3.80025285893569e-05, + "loss": 0.7247, + "step": 10974 + }, + { + "epoch": 0.16160948657231491, + "grad_norm": 1.1810616254806519, + "learning_rate": 3.793072558155093e-05, + "loss": 0.7256, + "step": 11005 + }, + { + "epoch": 0.16206472456265947, + "grad_norm": 1.1510332822799683, + "learning_rate": 3.785877665226426e-05, + "loss": 0.722, + "step": 11036 + }, + { + "epoch": 0.16251996255300402, + "grad_norm": 1.0499261617660522, + "learning_rate": 3.778668261343079e-05, + "loss": 0.7255, + "step": 11067 + }, + { + "epoch": 0.16297520054334858, + "grad_norm": 1.1509817838668823, + "learning_rate": 3.771444427862192e-05, + "loss": 0.7147, + "step": 11098 + }, + { + "epoch": 0.16343043853369313, + "grad_norm": 1.1638833284378052, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.7187, + "step": 11129 + }, + { + "epoch": 0.16388567652403765, + "grad_norm": 1.024095058441162, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.7244, + "step": 11160 + }, + { + "epoch": 0.1643409145143822, + "grad_norm": 1.1326078176498413, + "learning_rate": 3.749687165842753e-05, + "loss": 0.7221, + "step": 11191 + }, + { + "epoch": 0.16479615250472676, + "grad_norm": 1.1863468885421753, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 0.16525139049507132, + "grad_norm": 1.2948099374771118, + "learning_rate": 3.735111675341645e-05, + "loss": 0.7249, + "step": 11253 + }, + { + "epoch": 0.16570662848541587, + "grad_norm": 1.1384631395339966, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.7203, + "step": 11284 + }, + { + "epoch": 0.16616186647576042, + "grad_norm": 1.1335748434066772, + "learning_rate": 3.720480432728287e-05, + "loss": 0.7183, + "step": 11315 + }, + { + "epoch": 0.16661710446610495, + "grad_norm": 1.2514820098876953, + "learning_rate": 3.71314411067092e-05, + "loss": 0.7216, + "step": 11346 + }, + { + "epoch": 0.1670723424564495, + "grad_norm": 1.128739833831787, + "learning_rate": 3.70579409844715e-05, + "loss": 0.7234, + "step": 11377 + }, + { + "epoch": 0.16752758044679406, + "grad_norm": 1.152342677116394, + "learning_rate": 3.698430479000865e-05, + "loss": 0.7304, + "step": 11408 + }, + { + "epoch": 0.1679828184371386, + "grad_norm": 1.222424864768982, + "learning_rate": 3.691053335429509e-05, + "loss": 0.7164, + "step": 11439 + }, + { + "epoch": 0.16843805642748316, + "grad_norm": 1.2406268119812012, + "learning_rate": 3.683662750983147e-05, + "loss": 0.7157, + "step": 11470 + }, + { + "epoch": 0.16889329441782772, + "grad_norm": 1.1266759634017944, + "learning_rate": 3.676258809063518e-05, + "loss": 0.7228, + "step": 11501 + }, + { + "epoch": 0.16934853240817227, + "grad_norm": 1.1239662170410156, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.7266, + "step": 11532 + }, + { + "epoch": 0.1698037703985168, + "grad_norm": 1.0813047885894775, + "learning_rate": 3.661411187164166e-05, + "loss": 0.7278, + "step": 11563 + }, + { + "epoch": 0.17025900838886135, + "grad_norm": 56.909889221191406, + "learning_rate": 3.65396767473784e-05, + "loss": 0.7163, + "step": 11594 + }, + { + "epoch": 0.1707142463792059, + "grad_norm": 1.0749647617340088, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 0.17116948436955046, + "grad_norm": 1.0705510377883911, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.726, + "step": 11656 + }, + { + "epoch": 0.171624722359895, + "grad_norm": 1.1083126068115234, + "learning_rate": 3.63155933997859e-05, + "loss": 0.7262, + "step": 11687 + }, + { + "epoch": 0.17207996035023956, + "grad_norm": 1.2302770614624023, + "learning_rate": 3.624064243537758e-05, + "loss": 0.7229, + "step": 11718 + }, + { + "epoch": 0.1725351983405841, + "grad_norm": 1.240893006324768, + "learning_rate": 3.616556462184716e-05, + "loss": 0.7228, + "step": 11749 + }, + { + "epoch": 0.17299043633092864, + "grad_norm": 1.1897794008255005, + "learning_rate": 3.609036080643755e-05, + "loss": 0.7178, + "step": 11780 + }, + { + "epoch": 0.1734456743212732, + "grad_norm": 1.244950294494629, + "learning_rate": 3.60150318378136e-05, + "loss": 0.7244, + "step": 11811 + }, + { + "epoch": 0.17390091231161775, + "grad_norm": 1.1689528226852417, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.7181, + "step": 11842 + }, + { + "epoch": 0.1743561503019623, + "grad_norm": 1.1300387382507324, + "learning_rate": 3.586400184263408e-05, + "loss": 0.7176, + "step": 11873 + }, + { + "epoch": 0.17481138829230686, + "grad_norm": 1.0884469747543335, + "learning_rate": 3.578830252043148e-05, + "loss": 0.7182, + "step": 11904 + }, + { + "epoch": 0.17526662628265138, + "grad_norm": 1.119352102279663, + "learning_rate": 3.571248145370125e-05, + "loss": 0.7223, + "step": 11935 + }, + { + "epoch": 0.17572186427299594, + "grad_norm": 1.0979810953140259, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.7233, + "step": 11966 + }, + { + "epoch": 0.1761771022633405, + "grad_norm": 1.138571858406067, + "learning_rate": 3.556047751054378e-05, + "loss": 0.7279, + "step": 11997 + }, + { + "epoch": 0.17663234025368504, + "grad_norm": 1.2011150121688843, + "learning_rate": 3.548429634946039e-05, + "loss": 0.717, + "step": 12028 + }, + { + "epoch": 0.1770875782440296, + "grad_norm": 1.119472861289978, + "learning_rate": 3.540799687451768e-05, + "loss": 0.7227, + "step": 12059 + }, + { + "epoch": 0.17754281623437415, + "grad_norm": 1.089117169380188, + "learning_rate": 3.533157994674485e-05, + "loss": 0.7132, + "step": 12090 + }, + { + "epoch": 0.17799805422471868, + "grad_norm": 1.0937649011611938, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.7285, + "step": 12121 + }, + { + "epoch": 0.17845329221506323, + "grad_norm": 1.0359808206558228, + "learning_rate": 3.517839718344311e-05, + "loss": 0.7178, + "step": 12152 + }, + { + "epoch": 0.17890853020540778, + "grad_norm": 1.1309690475463867, + "learning_rate": 3.510163307656086e-05, + "loss": 0.713, + "step": 12183 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.006624315386364e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-12208/training_args.bin b/checkpoint-12208/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-12208/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-15260/config.json b/checkpoint-15260/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-15260/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-15260/generation_config.json b/checkpoint-15260/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-15260/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-15260/model-00001-of-00007.safetensors b/checkpoint-15260/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d199e823586a0c51e94259c2559a763c12f6e6db --- /dev/null +++ b/checkpoint-15260/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b4f523c77bb5dcb0b65b8318d392741111930986f6b93674dc0aa109c3901bd +size 4886466168 diff --git a/checkpoint-15260/model-00002-of-00007.safetensors b/checkpoint-15260/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-15260/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-15260/model-00003-of-00007.safetensors b/checkpoint-15260/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-15260/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-15260/model-00004-of-00007.safetensors b/checkpoint-15260/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-15260/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-15260/model-00005-of-00007.safetensors b/checkpoint-15260/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-15260/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-15260/model-00006-of-00007.safetensors b/checkpoint-15260/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf86271759a8c4e9ecef0f2e08632a81653d301a --- /dev/null +++ b/checkpoint-15260/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:684f13d566a6c91fe8134f87e6154be35f0b21323c9a6ea2fc715444a1ecc99e +size 4999813120 diff --git a/checkpoint-15260/model-00007-of-00007.safetensors b/checkpoint-15260/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d944ed57692c39911bdf55e206f85c27c4a2ad2d --- /dev/null +++ b/checkpoint-15260/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585a0c69b806d889b748d48d77931dead35fcf030731d93a63979b6cbe24a125 +size 2571158184 diff --git a/checkpoint-15260/model.safetensors.index.json b/checkpoint-15260/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-15260/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-15260/optimizer.pt b/checkpoint-15260/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bd5263cad5b71c7a62a66eb274ba8ab45287ec00 --- /dev/null +++ b/checkpoint-15260/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:834b4b424f4fb937ab7636998eb4186384e9ebab058f8cff97f10fa67b1a1f32 +size 15385036334 diff --git a/checkpoint-15260/rng_state.pth b/checkpoint-15260/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-15260/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-15260/scheduler.pt b/checkpoint-15260/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f65fadce527ecb01fb7d7fd7ef901b9b082813 --- /dev/null +++ b/checkpoint-15260/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ce335347efaebf038b8915bc0a8d2d587a4a1aa08ad0e015b4bc7cc4fba634e +size 1064 diff --git a/checkpoint-15260/trainer_state.json b/checkpoint-15260/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3419b0d2ea864654f5d8c3336af7971db235857 --- /dev/null +++ b/checkpoint-15260/trainer_state.json @@ -0,0 +1,3477 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22409457202121996, + "eval_steps": 500, + "global_step": 15260, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + }, + { + "epoch": 0.13475044514198653, + "grad_norm": 1.226491928100586, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.7349, + "step": 9176 + }, + { + "epoch": 0.13520568313233108, + "grad_norm": 1.3030654191970825, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.7371, + "step": 9207 + }, + { + "epoch": 0.13566092112267564, + "grad_norm": 1.17802894115448, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.7431, + "step": 9238 + }, + { + "epoch": 0.13611615911302016, + "grad_norm": 1.2893658876419067, + "learning_rate": 4.170395751189495e-05, + "loss": 0.7258, + "step": 9269 + }, + { + "epoch": 0.13657139710336472, + "grad_norm": 1.193758487701416, + "learning_rate": 4.164137885110921e-05, + "loss": 0.7362, + "step": 9300 + }, + { + "epoch": 0.13702663509370927, + "grad_norm": 1.2750978469848633, + "learning_rate": 4.157861239462495e-05, + "loss": 0.7265, + "step": 9331 + }, + { + "epoch": 0.13748187308405382, + "grad_norm": 1.161464810371399, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.7389, + "step": 9362 + }, + { + "epoch": 0.13793711107439838, + "grad_norm": 1.2749390602111816, + "learning_rate": 4.145251892991588e-05, + "loss": 0.7315, + "step": 9393 + }, + { + "epoch": 0.13839234906474293, + "grad_norm": 1.1729388236999512, + "learning_rate": 4.138919334463868e-05, + "loss": 0.7352, + "step": 9424 + }, + { + "epoch": 0.13884758705508746, + "grad_norm": 1.0987122058868408, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.7347, + "step": 9455 + }, + { + "epoch": 0.139302825045432, + "grad_norm": 1.0855766534805298, + "learning_rate": 4.126198804133398e-05, + "loss": 0.7271, + "step": 9486 + }, + { + "epoch": 0.13975806303577656, + "grad_norm": 1.1782172918319702, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.74, + "step": 9517 + }, + { + "epoch": 0.14021330102612112, + "grad_norm": 1.186962366104126, + "learning_rate": 4.113404868280107e-05, + "loss": 0.725, + "step": 9548 + }, + { + "epoch": 0.14066853901646567, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.106980553625457e-05, + "loss": 0.7346, + "step": 9579 + }, + { + "epoch": 0.14112377700681022, + "grad_norm": 1.154897928237915, + "learning_rate": 4.100538104413674e-05, + "loss": 0.7348, + "step": 9610 + }, + { + "epoch": 0.14157901499715475, + "grad_norm": 1.203722357749939, + "learning_rate": 4.09407759334692e-05, + "loss": 0.7312, + "step": 9641 + }, + { + "epoch": 0.1420342529874993, + "grad_norm": 1.102858543395996, + "learning_rate": 4.087599093331186e-05, + "loss": 0.7227, + "step": 9672 + }, + { + "epoch": 0.14248949097784386, + "grad_norm": 1.2667406797409058, + "learning_rate": 4.081102677475462e-05, + "loss": 0.727, + "step": 9703 + }, + { + "epoch": 0.1429447289681884, + "grad_norm": 1.221291422843933, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.7317, + "step": 9734 + }, + { + "epoch": 0.14339996695853297, + "grad_norm": 1.0426162481307983, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.7299, + "step": 9765 + }, + { + "epoch": 0.14385520494887752, + "grad_norm": 1.2158912420272827, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.7399, + "step": 9796 + }, + { + "epoch": 0.14431044293922204, + "grad_norm": 1.2689307928085327, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 0.1447656809295666, + "grad_norm": 1.3168833255767822, + "learning_rate": 4.048354433517794e-05, + "loss": 0.7258, + "step": 9858 + }, + { + "epoch": 0.14522091891991115, + "grad_norm": 1.1966122388839722, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.7375, + "step": 9889 + }, + { + "epoch": 0.1456761569102557, + "grad_norm": 1.1252835988998413, + "learning_rate": 4.035132306369438e-05, + "loss": 0.7254, + "step": 9920 + }, + { + "epoch": 0.14613139490060026, + "grad_norm": 1.186324119567871, + "learning_rate": 4.028495219804555e-05, + "loss": 0.7337, + "step": 9951 + }, + { + "epoch": 0.1465866328909448, + "grad_norm": 1.1606496572494507, + "learning_rate": 4.021840884378864e-05, + "loss": 0.7273, + "step": 9982 + }, + { + "epoch": 0.14704187088128934, + "grad_norm": 1.316298007965088, + "learning_rate": 4.015169375185633e-05, + "loss": 0.7276, + "step": 10013 + }, + { + "epoch": 0.1474971088716339, + "grad_norm": 1.224236249923706, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.7328, + "step": 10044 + }, + { + "epoch": 0.14795234686197845, + "grad_norm": 1.227148175239563, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.739, + "step": 10075 + }, + { + "epoch": 0.148407584852323, + "grad_norm": 1.1307867765426636, + "learning_rate": 3.995052558835377e-05, + "loss": 0.7324, + "step": 10106 + }, + { + "epoch": 0.14886282284266755, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.988313109368017e-05, + "loss": 0.7263, + "step": 10137 + }, + { + "epoch": 0.1493180608330121, + "grad_norm": 1.1724634170532227, + "learning_rate": 3.981556864489504e-05, + "loss": 0.7289, + "step": 10168 + }, + { + "epoch": 0.14977329882335666, + "grad_norm": 1.140684723854065, + "learning_rate": 3.974783900443142e-05, + "loss": 0.7309, + "step": 10199 + }, + { + "epoch": 0.15022853681370119, + "grad_norm": 1.167183756828308, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.729, + "step": 10230 + }, + { + "epoch": 0.15068377480404574, + "grad_norm": 1.2216546535491943, + "learning_rate": 3.961188120762596e-05, + "loss": 0.7288, + "step": 10261 + }, + { + "epoch": 0.1511390127943903, + "grad_norm": 1.227397084236145, + "learning_rate": 3.954365458554938e-05, + "loss": 0.7313, + "step": 10292 + }, + { + "epoch": 0.15159425078473485, + "grad_norm": 1.075441837310791, + "learning_rate": 3.947526384030751e-05, + "loss": 0.7306, + "step": 10323 + }, + { + "epoch": 0.1520494887750794, + "grad_norm": 1.1227167844772339, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.7287, + "step": 10354 + }, + { + "epoch": 0.15250472676542395, + "grad_norm": 1.0665740966796875, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.7246, + "step": 10385 + }, + { + "epoch": 0.15295996475576848, + "grad_norm": 1.250580072402954, + "learning_rate": 3.926911459260109e-05, + "loss": 0.7183, + "step": 10416 + }, + { + "epoch": 0.15341520274611303, + "grad_norm": 1.1809351444244385, + "learning_rate": 3.920007509089102e-05, + "loss": 0.7307, + "step": 10447 + }, + { + "epoch": 0.1538704407364576, + "grad_norm": 1.3934037685394287, + "learning_rate": 3.913087534326357e-05, + "loss": 0.7151, + "step": 10478 + }, + { + "epoch": 0.15432567872680214, + "grad_norm": 1.1272441148757935, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.7309, + "step": 10509 + }, + { + "epoch": 0.1547809167171467, + "grad_norm": 1.2410191297531128, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.7375, + "step": 10540 + }, + { + "epoch": 0.15523615470749125, + "grad_norm": 1.1426646709442139, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.7342, + "step": 10571 + }, + { + "epoch": 0.15569139269783577, + "grad_norm": 1.2037526369094849, + "learning_rate": 3.885248953871491e-05, + "loss": 0.7329, + "step": 10602 + }, + { + "epoch": 0.15614663068818033, + "grad_norm": 1.2198660373687744, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.7367, + "step": 10633 + }, + { + "epoch": 0.15660186867852488, + "grad_norm": 1.3129019737243652, + "learning_rate": 3.871235554965218e-05, + "loss": 0.7236, + "step": 10664 + }, + { + "epoch": 0.15705710666886943, + "grad_norm": 1.2182697057724, + "learning_rate": 3.864205604623078e-05, + "loss": 0.721, + "step": 10695 + }, + { + "epoch": 0.157512344659214, + "grad_norm": 1.1494821310043335, + "learning_rate": 3.857160259406107e-05, + "loss": 0.7223, + "step": 10726 + }, + { + "epoch": 0.15796758264955854, + "grad_norm": 1.0680466890335083, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.7224, + "step": 10757 + }, + { + "epoch": 0.15842282063990307, + "grad_norm": 1.155052661895752, + "learning_rate": 3.843023702543556e-05, + "loss": 0.7268, + "step": 10788 + }, + { + "epoch": 0.15887805863024762, + "grad_norm": 1.1415716409683228, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.7167, + "step": 10819 + }, + { + "epoch": 0.15933329662059217, + "grad_norm": 1.1486759185791016, + "learning_rate": 3.828826522492255e-05, + "loss": 0.7236, + "step": 10850 + }, + { + "epoch": 0.15978853461093673, + "grad_norm": 1.1916829347610474, + "learning_rate": 3.821705398930713e-05, + "loss": 0.7269, + "step": 10881 + }, + { + "epoch": 0.16024377260128128, + "grad_norm": 1.180760383605957, + "learning_rate": 3.814569360103385e-05, + "loss": 0.724, + "step": 10912 + }, + { + "epoch": 0.16069901059162583, + "grad_norm": 1.197651982307434, + "learning_rate": 3.807418486539499e-05, + "loss": 0.7265, + "step": 10943 + }, + { + "epoch": 0.16115424858197036, + "grad_norm": 1.1579455137252808, + "learning_rate": 3.80025285893569e-05, + "loss": 0.7247, + "step": 10974 + }, + { + "epoch": 0.16160948657231491, + "grad_norm": 1.1810616254806519, + "learning_rate": 3.793072558155093e-05, + "loss": 0.7256, + "step": 11005 + }, + { + "epoch": 0.16206472456265947, + "grad_norm": 1.1510332822799683, + "learning_rate": 3.785877665226426e-05, + "loss": 0.722, + "step": 11036 + }, + { + "epoch": 0.16251996255300402, + "grad_norm": 1.0499261617660522, + "learning_rate": 3.778668261343079e-05, + "loss": 0.7255, + "step": 11067 + }, + { + "epoch": 0.16297520054334858, + "grad_norm": 1.1509817838668823, + "learning_rate": 3.771444427862192e-05, + "loss": 0.7147, + "step": 11098 + }, + { + "epoch": 0.16343043853369313, + "grad_norm": 1.1638833284378052, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.7187, + "step": 11129 + }, + { + "epoch": 0.16388567652403765, + "grad_norm": 1.024095058441162, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.7244, + "step": 11160 + }, + { + "epoch": 0.1643409145143822, + "grad_norm": 1.1326078176498413, + "learning_rate": 3.749687165842753e-05, + "loss": 0.7221, + "step": 11191 + }, + { + "epoch": 0.16479615250472676, + "grad_norm": 1.1863468885421753, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 0.16525139049507132, + "grad_norm": 1.2948099374771118, + "learning_rate": 3.735111675341645e-05, + "loss": 0.7249, + "step": 11253 + }, + { + "epoch": 0.16570662848541587, + "grad_norm": 1.1384631395339966, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.7203, + "step": 11284 + }, + { + "epoch": 0.16616186647576042, + "grad_norm": 1.1335748434066772, + "learning_rate": 3.720480432728287e-05, + "loss": 0.7183, + "step": 11315 + }, + { + "epoch": 0.16661710446610495, + "grad_norm": 1.2514820098876953, + "learning_rate": 3.71314411067092e-05, + "loss": 0.7216, + "step": 11346 + }, + { + "epoch": 0.1670723424564495, + "grad_norm": 1.128739833831787, + "learning_rate": 3.70579409844715e-05, + "loss": 0.7234, + "step": 11377 + }, + { + "epoch": 0.16752758044679406, + "grad_norm": 1.152342677116394, + "learning_rate": 3.698430479000865e-05, + "loss": 0.7304, + "step": 11408 + }, + { + "epoch": 0.1679828184371386, + "grad_norm": 1.222424864768982, + "learning_rate": 3.691053335429509e-05, + "loss": 0.7164, + "step": 11439 + }, + { + "epoch": 0.16843805642748316, + "grad_norm": 1.2406268119812012, + "learning_rate": 3.683662750983147e-05, + "loss": 0.7157, + "step": 11470 + }, + { + "epoch": 0.16889329441782772, + "grad_norm": 1.1266759634017944, + "learning_rate": 3.676258809063518e-05, + "loss": 0.7228, + "step": 11501 + }, + { + "epoch": 0.16934853240817227, + "grad_norm": 1.1239662170410156, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.7266, + "step": 11532 + }, + { + "epoch": 0.1698037703985168, + "grad_norm": 1.0813047885894775, + "learning_rate": 3.661411187164166e-05, + "loss": 0.7278, + "step": 11563 + }, + { + "epoch": 0.17025900838886135, + "grad_norm": 56.909889221191406, + "learning_rate": 3.65396767473784e-05, + "loss": 0.7163, + "step": 11594 + }, + { + "epoch": 0.1707142463792059, + "grad_norm": 1.0749647617340088, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 0.17116948436955046, + "grad_norm": 1.0705510377883911, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.726, + "step": 11656 + }, + { + "epoch": 0.171624722359895, + "grad_norm": 1.1083126068115234, + "learning_rate": 3.63155933997859e-05, + "loss": 0.7262, + "step": 11687 + }, + { + "epoch": 0.17207996035023956, + "grad_norm": 1.2302770614624023, + "learning_rate": 3.624064243537758e-05, + "loss": 0.7229, + "step": 11718 + }, + { + "epoch": 0.1725351983405841, + "grad_norm": 1.240893006324768, + "learning_rate": 3.616556462184716e-05, + "loss": 0.7228, + "step": 11749 + }, + { + "epoch": 0.17299043633092864, + "grad_norm": 1.1897794008255005, + "learning_rate": 3.609036080643755e-05, + "loss": 0.7178, + "step": 11780 + }, + { + "epoch": 0.1734456743212732, + "grad_norm": 1.244950294494629, + "learning_rate": 3.60150318378136e-05, + "loss": 0.7244, + "step": 11811 + }, + { + "epoch": 0.17390091231161775, + "grad_norm": 1.1689528226852417, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.7181, + "step": 11842 + }, + { + "epoch": 0.1743561503019623, + "grad_norm": 1.1300387382507324, + "learning_rate": 3.586400184263408e-05, + "loss": 0.7176, + "step": 11873 + }, + { + "epoch": 0.17481138829230686, + "grad_norm": 1.0884469747543335, + "learning_rate": 3.578830252043148e-05, + "loss": 0.7182, + "step": 11904 + }, + { + "epoch": 0.17526662628265138, + "grad_norm": 1.119352102279663, + "learning_rate": 3.571248145370125e-05, + "loss": 0.7223, + "step": 11935 + }, + { + "epoch": 0.17572186427299594, + "grad_norm": 1.0979810953140259, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.7233, + "step": 11966 + }, + { + "epoch": 0.1761771022633405, + "grad_norm": 1.138571858406067, + "learning_rate": 3.556047751054378e-05, + "loss": 0.7279, + "step": 11997 + }, + { + "epoch": 0.17663234025368504, + "grad_norm": 1.2011150121688843, + "learning_rate": 3.548429634946039e-05, + "loss": 0.717, + "step": 12028 + }, + { + "epoch": 0.1770875782440296, + "grad_norm": 1.119472861289978, + "learning_rate": 3.540799687451768e-05, + "loss": 0.7227, + "step": 12059 + }, + { + "epoch": 0.17754281623437415, + "grad_norm": 1.089117169380188, + "learning_rate": 3.533157994674485e-05, + "loss": 0.7132, + "step": 12090 + }, + { + "epoch": 0.17799805422471868, + "grad_norm": 1.0937649011611938, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.7285, + "step": 12121 + }, + { + "epoch": 0.17845329221506323, + "grad_norm": 1.0359808206558228, + "learning_rate": 3.517839718344311e-05, + "loss": 0.7178, + "step": 12152 + }, + { + "epoch": 0.17890853020540778, + "grad_norm": 1.1309690475463867, + "learning_rate": 3.510163307656086e-05, + "loss": 0.713, + "step": 12183 + }, + { + "epoch": 0.17936376819575234, + "grad_norm": 1.126597285270691, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.7139, + "step": 12214 + }, + { + "epoch": 0.1798190061860969, + "grad_norm": 1.29429292678833, + "learning_rate": 3.494776374368643e-05, + "loss": 0.7261, + "step": 12245 + }, + { + "epoch": 0.18027424417644144, + "grad_norm": 1.1156132221221924, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.7124, + "step": 12276 + }, + { + "epoch": 0.18072948216678597, + "grad_norm": 1.1548200845718384, + "learning_rate": 3.479344537543164e-05, + "loss": 0.7195, + "step": 12307 + }, + { + "epoch": 0.18118472015713052, + "grad_norm": 1.1318944692611694, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.7215, + "step": 12338 + }, + { + "epoch": 0.18163995814747508, + "grad_norm": 1.1952061653137207, + "learning_rate": 3.463868493762412e-05, + "loss": 0.7159, + "step": 12369 + }, + { + "epoch": 0.18209519613781963, + "grad_norm": 1.1703433990478516, + "learning_rate": 3.456114112492418e-05, + "loss": 0.7145, + "step": 12400 + }, + { + "epoch": 0.18255043412816419, + "grad_norm": 1.065319538116455, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.7108, + "step": 12431 + }, + { + "epoch": 0.18300567211850874, + "grad_norm": 1.278533697128296, + "learning_rate": 3.440573068727905e-05, + "loss": 0.7116, + "step": 12462 + }, + { + "epoch": 0.18346091010885326, + "grad_norm": 1.1642391681671143, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.716, + "step": 12493 + }, + { + "epoch": 0.18391614809919782, + "grad_norm": 1.2108798027038574, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.7194, + "step": 12524 + }, + { + "epoch": 0.18437138608954237, + "grad_norm": 1.085163950920105, + "learning_rate": 3.417182116258899e-05, + "loss": 0.7166, + "step": 12555 + }, + { + "epoch": 0.18482662407988693, + "grad_norm": 1.124064564704895, + "learning_rate": 3.409364314116074e-05, + "loss": 0.7218, + "step": 12586 + }, + { + "epoch": 0.18528186207023148, + "grad_norm": 1.1817457675933838, + "learning_rate": 3.401536249920559e-05, + "loss": 0.7161, + "step": 12617 + }, + { + "epoch": 0.18573710006057603, + "grad_norm": 1.1430065631866455, + "learning_rate": 3.393698012010998e-05, + "loss": 0.7223, + "step": 12648 + }, + { + "epoch": 0.18619233805092056, + "grad_norm": 1.151633620262146, + "learning_rate": 3.385849688840839e-05, + "loss": 0.7181, + "step": 12679 + }, + { + "epoch": 0.1866475760412651, + "grad_norm": 1.1847879886627197, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.7144, + "step": 12710 + }, + { + "epoch": 0.18710281403160967, + "grad_norm": 1.0997257232666016, + "learning_rate": 3.370123141100578e-05, + "loss": 0.7179, + "step": 12741 + }, + { + "epoch": 0.18755805202195422, + "grad_norm": 1.3207893371582031, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.7147, + "step": 12772 + }, + { + "epoch": 0.18801329001229877, + "grad_norm": 1.1414912939071655, + "learning_rate": 3.35435731658559e-05, + "loss": 0.7258, + "step": 12803 + }, + { + "epoch": 0.18846852800264333, + "grad_norm": 1.1708139181137085, + "learning_rate": 3.346459897862552e-05, + "loss": 0.7222, + "step": 12834 + }, + { + "epoch": 0.18892376599298788, + "grad_norm": 1.0472559928894043, + "learning_rate": 3.338552926954613e-05, + "loss": 0.7168, + "step": 12865 + }, + { + "epoch": 0.1893790039833324, + "grad_norm": 1.1117762327194214, + "learning_rate": 3.330636493090868e-05, + "loss": 0.7128, + "step": 12896 + }, + { + "epoch": 0.18983424197367696, + "grad_norm": 1.1351251602172852, + "learning_rate": 3.322710685607193e-05, + "loss": 0.72, + "step": 12927 + }, + { + "epoch": 0.1902894799640215, + "grad_norm": 1.1530694961547852, + "learning_rate": 3.314775593945251e-05, + "loss": 0.7133, + "step": 12958 + }, + { + "epoch": 0.19074471795436607, + "grad_norm": 1.1590766906738281, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.7236, + "step": 12989 + }, + { + "epoch": 0.19119995594471062, + "grad_norm": 1.0657463073730469, + "learning_rate": 3.298877916376047e-05, + "loss": 0.7159, + "step": 13020 + }, + { + "epoch": 0.19165519393505517, + "grad_norm": 1.090234637260437, + "learning_rate": 3.290915509871915e-05, + "loss": 0.7249, + "step": 13051 + }, + { + "epoch": 0.1921104319253997, + "grad_norm": 1.1001083850860596, + "learning_rate": 3.282944177993753e-05, + "loss": 0.721, + "step": 13082 + }, + { + "epoch": 0.19256566991574425, + "grad_norm": 1.0616897344589233, + "learning_rate": 3.274964010696957e-05, + "loss": 0.7145, + "step": 13113 + }, + { + "epoch": 0.1930209079060888, + "grad_norm": 1.05698823928833, + "learning_rate": 3.266975098036629e-05, + "loss": 0.7106, + "step": 13144 + }, + { + "epoch": 0.19347614589643336, + "grad_norm": 1.0747356414794922, + "learning_rate": 3.258977530166562e-05, + "loss": 0.7199, + "step": 13175 + }, + { + "epoch": 0.1939313838867779, + "grad_norm": 1.112629771232605, + "learning_rate": 3.250971397338227e-05, + "loss": 0.7088, + "step": 13206 + }, + { + "epoch": 0.19438662187712247, + "grad_norm": 1.13507878780365, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.7084, + "step": 13237 + }, + { + "epoch": 0.194841859867467, + "grad_norm": 1.137793779373169, + "learning_rate": 3.234933798294859e-05, + "loss": 0.7121, + "step": 13268 + }, + { + "epoch": 0.19529709785781155, + "grad_norm": 1.263027548789978, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.7117, + "step": 13299 + }, + { + "epoch": 0.1957523358481561, + "grad_norm": 1.1799430847167969, + "learning_rate": 3.218863024832985e-05, + "loss": 0.7144, + "step": 13330 + }, + { + "epoch": 0.19620757383850065, + "grad_norm": 1.040297508239746, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.7089, + "step": 13361 + }, + { + "epoch": 0.1966628118288452, + "grad_norm": 1.110816240310669, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.7166, + "step": 13392 + }, + { + "epoch": 0.19711804981918976, + "grad_norm": 1.0399518013000488, + "learning_rate": 3.194696249871729e-05, + "loss": 0.7081, + "step": 13423 + }, + { + "epoch": 0.1975732878095343, + "grad_norm": 1.1946238279342651, + "learning_rate": 3.186624857814164e-05, + "loss": 0.7112, + "step": 13454 + }, + { + "epoch": 0.19802852579987884, + "grad_norm": 1.1289162635803223, + "learning_rate": 3.178545717288401e-05, + "loss": 0.7079, + "step": 13485 + }, + { + "epoch": 0.1984837637902234, + "grad_norm": 1.17491614818573, + "learning_rate": 3.170458919466444e-05, + "loss": 0.7117, + "step": 13516 + }, + { + "epoch": 0.19893900178056795, + "grad_norm": 1.1498847007751465, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.7131, + "step": 13547 + }, + { + "epoch": 0.1993942397709125, + "grad_norm": 1.2516326904296875, + "learning_rate": 3.154262717052985e-05, + "loss": 0.7145, + "step": 13578 + }, + { + "epoch": 0.19984947776125705, + "grad_norm": 1.2112692594528198, + "learning_rate": 3.146153495233426e-05, + "loss": 0.7098, + "step": 13609 + }, + { + "epoch": 0.20030471575160158, + "grad_norm": 0.989539384841919, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.7182, + "step": 13640 + }, + { + "epoch": 0.20075995374194613, + "grad_norm": 1.0849697589874268, + "learning_rate": 3.129913267924946e-05, + "loss": 0.7153, + "step": 13671 + }, + { + "epoch": 0.2012151917322907, + "grad_norm": 1.1133675575256348, + "learning_rate": 3.121782445704782e-05, + "loss": 0.7104, + "step": 13702 + }, + { + "epoch": 0.20167042972263524, + "grad_norm": 1.1086009740829468, + "learning_rate": 3.11364460675423e-05, + "loss": 0.7125, + "step": 13733 + }, + { + "epoch": 0.2021256677129798, + "grad_norm": 1.0160905122756958, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.7055, + "step": 13764 + }, + { + "epoch": 0.20258090570332435, + "grad_norm": 1.0811957120895386, + "learning_rate": 3.097348246077728e-05, + "loss": 0.7105, + "step": 13795 + }, + { + "epoch": 0.20303614369366887, + "grad_norm": 1.0012198686599731, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.7134, + "step": 13826 + }, + { + "epoch": 0.20349138168401343, + "grad_norm": 1.2298318147659302, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.7054, + "step": 13857 + }, + { + "epoch": 0.20394661967435798, + "grad_norm": 1.1324481964111328, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.7023, + "step": 13888 + }, + { + "epoch": 0.20440185766470254, + "grad_norm": 1.1454377174377441, + "learning_rate": 3.064675369851637e-05, + "loss": 0.7124, + "step": 13919 + }, + { + "epoch": 0.2048570956550471, + "grad_norm": 1.0461289882659912, + "learning_rate": 3.056490989455289e-05, + "loss": 0.7159, + "step": 13950 + }, + { + "epoch": 0.20531233364539164, + "grad_norm": 1.0020164251327515, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.702, + "step": 13981 + }, + { + "epoch": 0.20576757163573617, + "grad_norm": 1.0952621698379517, + "learning_rate": 3.040103481317539e-05, + "loss": 0.7108, + "step": 14012 + }, + { + "epoch": 0.20622280962608072, + "grad_norm": 1.129813551902771, + "learning_rate": 3.03190053850694e-05, + "loss": 0.7043, + "step": 14043 + }, + { + "epoch": 0.20667804761642528, + "grad_norm": 1.012137770652771, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.7002, + "step": 14074 + }, + { + "epoch": 0.20713328560676983, + "grad_norm": 1.1774550676345825, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.6996, + "step": 14105 + }, + { + "epoch": 0.20758852359711438, + "grad_norm": 1.0878807306289673, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.7075, + "step": 14136 + }, + { + "epoch": 0.20804376158745894, + "grad_norm": 1.1599992513656616, + "learning_rate": 2.999029669712431e-05, + "loss": 0.7165, + "step": 14167 + }, + { + "epoch": 0.2084989995778035, + "grad_norm": 1.184403419494629, + "learning_rate": 2.990797641805408e-05, + "loss": 0.709, + "step": 14198 + }, + { + "epoch": 0.20895423756814802, + "grad_norm": 1.0857961177825928, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6986, + "step": 14229 + }, + { + "epoch": 0.20940947555849257, + "grad_norm": 1.0922378301620483, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.7085, + "step": 14260 + }, + { + "epoch": 0.20986471354883712, + "grad_norm": 1.0843058824539185, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.7114, + "step": 14291 + }, + { + "epoch": 0.21031995153918168, + "grad_norm": 1.2273341417312622, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.7092, + "step": 14322 + }, + { + "epoch": 0.21077518952952623, + "grad_norm": 1.1041830778121948, + "learning_rate": 2.949556283611942e-05, + "loss": 0.7086, + "step": 14353 + }, + { + "epoch": 0.21123042751987078, + "grad_norm": 1.1019948720932007, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.7095, + "step": 14384 + }, + { + "epoch": 0.2116856655102153, + "grad_norm": 1.1595929861068726, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.7083, + "step": 14415 + }, + { + "epoch": 0.21214090350055986, + "grad_norm": 1.252021312713623, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.7045, + "step": 14446 + }, + { + "epoch": 0.21259614149090442, + "grad_norm": 1.1469565629959106, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6995, + "step": 14477 + }, + { + "epoch": 0.21305137948124897, + "grad_norm": 1.1796189546585083, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.7101, + "step": 14508 + }, + { + "epoch": 0.21350661747159352, + "grad_norm": 1.183271884918213, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.7074, + "step": 14539 + }, + { + "epoch": 0.21396185546193808, + "grad_norm": 1.2489229440689087, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.7126, + "step": 14570 + }, + { + "epoch": 0.2144170934522826, + "grad_norm": 1.059584379196167, + "learning_rate": 2.883311164593017e-05, + "loss": 0.7045, + "step": 14601 + }, + { + "epoch": 0.21487233144262716, + "grad_norm": 1.1565430164337158, + "learning_rate": 2.875010077160754e-05, + "loss": 0.7075, + "step": 14632 + }, + { + "epoch": 0.2153275694329717, + "grad_norm": 1.0368664264678955, + "learning_rate": 2.866704757790741e-05, + "loss": 0.7065, + "step": 14663 + }, + { + "epoch": 0.21578280742331626, + "grad_norm": 1.1395366191864014, + "learning_rate": 2.858395300207376e-05, + "loss": 0.7029, + "step": 14694 + }, + { + "epoch": 0.21623804541366082, + "grad_norm": 1.077816128730774, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.7078, + "step": 14725 + }, + { + "epoch": 0.21669328340400537, + "grad_norm": 1.2020843029022217, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.7011, + "step": 14756 + }, + { + "epoch": 0.2171485213943499, + "grad_norm": 1.0293958187103271, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.7038, + "step": 14787 + }, + { + "epoch": 0.21760375938469445, + "grad_norm": 1.2163504362106323, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.7029, + "step": 14818 + }, + { + "epoch": 0.218058997375039, + "grad_norm": 1.0222349166870117, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.7084, + "step": 14849 + }, + { + "epoch": 0.21851423536538356, + "grad_norm": 1.0532034635543823, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.7162, + "step": 14880 + }, + { + "epoch": 0.2189694733557281, + "grad_norm": 0.9981489181518555, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.7088, + "step": 14911 + }, + { + "epoch": 0.21942471134607267, + "grad_norm": 1.0804964303970337, + "learning_rate": 2.791781925709473e-05, + "loss": 0.7035, + "step": 14942 + }, + { + "epoch": 0.2198799493364172, + "grad_norm": 1.1326287984848022, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.708, + "step": 14973 + }, + { + "epoch": 0.22033518732676174, + "grad_norm": 1.1164413690567017, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.7047, + "step": 15004 + }, + { + "epoch": 0.2207904253171063, + "grad_norm": 1.1455166339874268, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.708, + "step": 15035 + }, + { + "epoch": 0.22124566330745085, + "grad_norm": 1.1329716444015503, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6923, + "step": 15066 + }, + { + "epoch": 0.2217009012977954, + "grad_norm": 1.3479692935943604, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.7119, + "step": 15097 + }, + { + "epoch": 0.22215613928813996, + "grad_norm": 1.1160681247711182, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.7031, + "step": 15128 + }, + { + "epoch": 0.22261137727848448, + "grad_norm": 1.1188467741012573, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.7029, + "step": 15159 + }, + { + "epoch": 0.22306661526882904, + "grad_norm": 0.9910608530044556, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6996, + "step": 15190 + }, + { + "epoch": 0.2235218532591736, + "grad_norm": 1.100176453590393, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.7027, + "step": 15221 + }, + { + "epoch": 0.22397709124951815, + "grad_norm": 1.1848573684692383, + "learning_rate": 2.708224532974953e-05, + "loss": 0.7111, + "step": 15252 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1258280394232955e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-15260/training_args.bin b/checkpoint-15260/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-15260/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-18312/config.json b/checkpoint-18312/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-18312/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-18312/generation_config.json b/checkpoint-18312/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-18312/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-18312/model-00001-of-00007.safetensors b/checkpoint-18312/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4be852913c6f31b839178fb421557e281ba308c --- /dev/null +++ b/checkpoint-18312/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b59e2f0f68ed5cc4cf50dc986bcae4af0b50ac1c5b437494588e81b8427445 +size 4886466168 diff --git a/checkpoint-18312/model-00002-of-00007.safetensors b/checkpoint-18312/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-18312/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-18312/model-00003-of-00007.safetensors b/checkpoint-18312/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-18312/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-18312/model-00004-of-00007.safetensors b/checkpoint-18312/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-18312/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-18312/model-00005-of-00007.safetensors b/checkpoint-18312/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-18312/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-18312/model-00006-of-00007.safetensors b/checkpoint-18312/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4ab42ab17299c23e4ee9f4df32110d2059a49853 --- /dev/null +++ b/checkpoint-18312/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:675cc9c3e5214625e998d62d595e208a2cf4468407179e291eebea96052431bf +size 4999813120 diff --git a/checkpoint-18312/model-00007-of-00007.safetensors b/checkpoint-18312/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..14d0da8d5211e7e6fd54d75033e12b9c61cdacce --- /dev/null +++ b/checkpoint-18312/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1912930245fc72f4f2d5017b0b8971fa3ce16bac3bd44362105ed61a31028e30 +size 2571158184 diff --git a/checkpoint-18312/model.safetensors.index.json b/checkpoint-18312/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-18312/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-18312/optimizer.pt b/checkpoint-18312/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a83310250de4dec864e7584c2d10c80ff10f749 --- /dev/null +++ b/checkpoint-18312/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:608915729c51a97bb28565c2cad078a86cf06eb2f804d31d71bdc09a6216cdfd +size 15385036334 diff --git a/checkpoint-18312/rng_state.pth b/checkpoint-18312/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-18312/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-18312/scheduler.pt b/checkpoint-18312/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..59a8b46d1ac64fc3cd4c673b6051786fee3ed26d --- /dev/null +++ b/checkpoint-18312/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0e65c3d6f29e706fd941a38280ce5628189a6998eac6d29abbeab00ad838d00 +size 1064 diff --git a/checkpoint-18312/trainer_state.json b/checkpoint-18312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2b2d2e457462c51b524e89f9e96429ab98bc5410 --- /dev/null +++ b/checkpoint-18312/trainer_state.json @@ -0,0 +1,4163 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.268913486425464, + "eval_steps": 500, + "global_step": 18312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + }, + { + "epoch": 0.13475044514198653, + "grad_norm": 1.226491928100586, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.7349, + "step": 9176 + }, + { + "epoch": 0.13520568313233108, + "grad_norm": 1.3030654191970825, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.7371, + "step": 9207 + }, + { + "epoch": 0.13566092112267564, + "grad_norm": 1.17802894115448, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.7431, + "step": 9238 + }, + { + "epoch": 0.13611615911302016, + "grad_norm": 1.2893658876419067, + "learning_rate": 4.170395751189495e-05, + "loss": 0.7258, + "step": 9269 + }, + { + "epoch": 0.13657139710336472, + "grad_norm": 1.193758487701416, + "learning_rate": 4.164137885110921e-05, + "loss": 0.7362, + "step": 9300 + }, + { + "epoch": 0.13702663509370927, + "grad_norm": 1.2750978469848633, + "learning_rate": 4.157861239462495e-05, + "loss": 0.7265, + "step": 9331 + }, + { + "epoch": 0.13748187308405382, + "grad_norm": 1.161464810371399, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.7389, + "step": 9362 + }, + { + "epoch": 0.13793711107439838, + "grad_norm": 1.2749390602111816, + "learning_rate": 4.145251892991588e-05, + "loss": 0.7315, + "step": 9393 + }, + { + "epoch": 0.13839234906474293, + "grad_norm": 1.1729388236999512, + "learning_rate": 4.138919334463868e-05, + "loss": 0.7352, + "step": 9424 + }, + { + "epoch": 0.13884758705508746, + "grad_norm": 1.0987122058868408, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.7347, + "step": 9455 + }, + { + "epoch": 0.139302825045432, + "grad_norm": 1.0855766534805298, + "learning_rate": 4.126198804133398e-05, + "loss": 0.7271, + "step": 9486 + }, + { + "epoch": 0.13975806303577656, + "grad_norm": 1.1782172918319702, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.74, + "step": 9517 + }, + { + "epoch": 0.14021330102612112, + "grad_norm": 1.186962366104126, + "learning_rate": 4.113404868280107e-05, + "loss": 0.725, + "step": 9548 + }, + { + "epoch": 0.14066853901646567, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.106980553625457e-05, + "loss": 0.7346, + "step": 9579 + }, + { + "epoch": 0.14112377700681022, + "grad_norm": 1.154897928237915, + "learning_rate": 4.100538104413674e-05, + "loss": 0.7348, + "step": 9610 + }, + { + "epoch": 0.14157901499715475, + "grad_norm": 1.203722357749939, + "learning_rate": 4.09407759334692e-05, + "loss": 0.7312, + "step": 9641 + }, + { + "epoch": 0.1420342529874993, + "grad_norm": 1.102858543395996, + "learning_rate": 4.087599093331186e-05, + "loss": 0.7227, + "step": 9672 + }, + { + "epoch": 0.14248949097784386, + "grad_norm": 1.2667406797409058, + "learning_rate": 4.081102677475462e-05, + "loss": 0.727, + "step": 9703 + }, + { + "epoch": 0.1429447289681884, + "grad_norm": 1.221291422843933, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.7317, + "step": 9734 + }, + { + "epoch": 0.14339996695853297, + "grad_norm": 1.0426162481307983, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.7299, + "step": 9765 + }, + { + "epoch": 0.14385520494887752, + "grad_norm": 1.2158912420272827, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.7399, + "step": 9796 + }, + { + "epoch": 0.14431044293922204, + "grad_norm": 1.2689307928085327, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 0.1447656809295666, + "grad_norm": 1.3168833255767822, + "learning_rate": 4.048354433517794e-05, + "loss": 0.7258, + "step": 9858 + }, + { + "epoch": 0.14522091891991115, + "grad_norm": 1.1966122388839722, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.7375, + "step": 9889 + }, + { + "epoch": 0.1456761569102557, + "grad_norm": 1.1252835988998413, + "learning_rate": 4.035132306369438e-05, + "loss": 0.7254, + "step": 9920 + }, + { + "epoch": 0.14613139490060026, + "grad_norm": 1.186324119567871, + "learning_rate": 4.028495219804555e-05, + "loss": 0.7337, + "step": 9951 + }, + { + "epoch": 0.1465866328909448, + "grad_norm": 1.1606496572494507, + "learning_rate": 4.021840884378864e-05, + "loss": 0.7273, + "step": 9982 + }, + { + "epoch": 0.14704187088128934, + "grad_norm": 1.316298007965088, + "learning_rate": 4.015169375185633e-05, + "loss": 0.7276, + "step": 10013 + }, + { + "epoch": 0.1474971088716339, + "grad_norm": 1.224236249923706, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.7328, + "step": 10044 + }, + { + "epoch": 0.14795234686197845, + "grad_norm": 1.227148175239563, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.739, + "step": 10075 + }, + { + "epoch": 0.148407584852323, + "grad_norm": 1.1307867765426636, + "learning_rate": 3.995052558835377e-05, + "loss": 0.7324, + "step": 10106 + }, + { + "epoch": 0.14886282284266755, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.988313109368017e-05, + "loss": 0.7263, + "step": 10137 + }, + { + "epoch": 0.1493180608330121, + "grad_norm": 1.1724634170532227, + "learning_rate": 3.981556864489504e-05, + "loss": 0.7289, + "step": 10168 + }, + { + "epoch": 0.14977329882335666, + "grad_norm": 1.140684723854065, + "learning_rate": 3.974783900443142e-05, + "loss": 0.7309, + "step": 10199 + }, + { + "epoch": 0.15022853681370119, + "grad_norm": 1.167183756828308, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.729, + "step": 10230 + }, + { + "epoch": 0.15068377480404574, + "grad_norm": 1.2216546535491943, + "learning_rate": 3.961188120762596e-05, + "loss": 0.7288, + "step": 10261 + }, + { + "epoch": 0.1511390127943903, + "grad_norm": 1.227397084236145, + "learning_rate": 3.954365458554938e-05, + "loss": 0.7313, + "step": 10292 + }, + { + "epoch": 0.15159425078473485, + "grad_norm": 1.075441837310791, + "learning_rate": 3.947526384030751e-05, + "loss": 0.7306, + "step": 10323 + }, + { + "epoch": 0.1520494887750794, + "grad_norm": 1.1227167844772339, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.7287, + "step": 10354 + }, + { + "epoch": 0.15250472676542395, + "grad_norm": 1.0665740966796875, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.7246, + "step": 10385 + }, + { + "epoch": 0.15295996475576848, + "grad_norm": 1.250580072402954, + "learning_rate": 3.926911459260109e-05, + "loss": 0.7183, + "step": 10416 + }, + { + "epoch": 0.15341520274611303, + "grad_norm": 1.1809351444244385, + "learning_rate": 3.920007509089102e-05, + "loss": 0.7307, + "step": 10447 + }, + { + "epoch": 0.1538704407364576, + "grad_norm": 1.3934037685394287, + "learning_rate": 3.913087534326357e-05, + "loss": 0.7151, + "step": 10478 + }, + { + "epoch": 0.15432567872680214, + "grad_norm": 1.1272441148757935, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.7309, + "step": 10509 + }, + { + "epoch": 0.1547809167171467, + "grad_norm": 1.2410191297531128, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.7375, + "step": 10540 + }, + { + "epoch": 0.15523615470749125, + "grad_norm": 1.1426646709442139, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.7342, + "step": 10571 + }, + { + "epoch": 0.15569139269783577, + "grad_norm": 1.2037526369094849, + "learning_rate": 3.885248953871491e-05, + "loss": 0.7329, + "step": 10602 + }, + { + "epoch": 0.15614663068818033, + "grad_norm": 1.2198660373687744, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.7367, + "step": 10633 + }, + { + "epoch": 0.15660186867852488, + "grad_norm": 1.3129019737243652, + "learning_rate": 3.871235554965218e-05, + "loss": 0.7236, + "step": 10664 + }, + { + "epoch": 0.15705710666886943, + "grad_norm": 1.2182697057724, + "learning_rate": 3.864205604623078e-05, + "loss": 0.721, + "step": 10695 + }, + { + "epoch": 0.157512344659214, + "grad_norm": 1.1494821310043335, + "learning_rate": 3.857160259406107e-05, + "loss": 0.7223, + "step": 10726 + }, + { + "epoch": 0.15796758264955854, + "grad_norm": 1.0680466890335083, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.7224, + "step": 10757 + }, + { + "epoch": 0.15842282063990307, + "grad_norm": 1.155052661895752, + "learning_rate": 3.843023702543556e-05, + "loss": 0.7268, + "step": 10788 + }, + { + "epoch": 0.15887805863024762, + "grad_norm": 1.1415716409683228, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.7167, + "step": 10819 + }, + { + "epoch": 0.15933329662059217, + "grad_norm": 1.1486759185791016, + "learning_rate": 3.828826522492255e-05, + "loss": 0.7236, + "step": 10850 + }, + { + "epoch": 0.15978853461093673, + "grad_norm": 1.1916829347610474, + "learning_rate": 3.821705398930713e-05, + "loss": 0.7269, + "step": 10881 + }, + { + "epoch": 0.16024377260128128, + "grad_norm": 1.180760383605957, + "learning_rate": 3.814569360103385e-05, + "loss": 0.724, + "step": 10912 + }, + { + "epoch": 0.16069901059162583, + "grad_norm": 1.197651982307434, + "learning_rate": 3.807418486539499e-05, + "loss": 0.7265, + "step": 10943 + }, + { + "epoch": 0.16115424858197036, + "grad_norm": 1.1579455137252808, + "learning_rate": 3.80025285893569e-05, + "loss": 0.7247, + "step": 10974 + }, + { + "epoch": 0.16160948657231491, + "grad_norm": 1.1810616254806519, + "learning_rate": 3.793072558155093e-05, + "loss": 0.7256, + "step": 11005 + }, + { + "epoch": 0.16206472456265947, + "grad_norm": 1.1510332822799683, + "learning_rate": 3.785877665226426e-05, + "loss": 0.722, + "step": 11036 + }, + { + "epoch": 0.16251996255300402, + "grad_norm": 1.0499261617660522, + "learning_rate": 3.778668261343079e-05, + "loss": 0.7255, + "step": 11067 + }, + { + "epoch": 0.16297520054334858, + "grad_norm": 1.1509817838668823, + "learning_rate": 3.771444427862192e-05, + "loss": 0.7147, + "step": 11098 + }, + { + "epoch": 0.16343043853369313, + "grad_norm": 1.1638833284378052, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.7187, + "step": 11129 + }, + { + "epoch": 0.16388567652403765, + "grad_norm": 1.024095058441162, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.7244, + "step": 11160 + }, + { + "epoch": 0.1643409145143822, + "grad_norm": 1.1326078176498413, + "learning_rate": 3.749687165842753e-05, + "loss": 0.7221, + "step": 11191 + }, + { + "epoch": 0.16479615250472676, + "grad_norm": 1.1863468885421753, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 0.16525139049507132, + "grad_norm": 1.2948099374771118, + "learning_rate": 3.735111675341645e-05, + "loss": 0.7249, + "step": 11253 + }, + { + "epoch": 0.16570662848541587, + "grad_norm": 1.1384631395339966, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.7203, + "step": 11284 + }, + { + "epoch": 0.16616186647576042, + "grad_norm": 1.1335748434066772, + "learning_rate": 3.720480432728287e-05, + "loss": 0.7183, + "step": 11315 + }, + { + "epoch": 0.16661710446610495, + "grad_norm": 1.2514820098876953, + "learning_rate": 3.71314411067092e-05, + "loss": 0.7216, + "step": 11346 + }, + { + "epoch": 0.1670723424564495, + "grad_norm": 1.128739833831787, + "learning_rate": 3.70579409844715e-05, + "loss": 0.7234, + "step": 11377 + }, + { + "epoch": 0.16752758044679406, + "grad_norm": 1.152342677116394, + "learning_rate": 3.698430479000865e-05, + "loss": 0.7304, + "step": 11408 + }, + { + "epoch": 0.1679828184371386, + "grad_norm": 1.222424864768982, + "learning_rate": 3.691053335429509e-05, + "loss": 0.7164, + "step": 11439 + }, + { + "epoch": 0.16843805642748316, + "grad_norm": 1.2406268119812012, + "learning_rate": 3.683662750983147e-05, + "loss": 0.7157, + "step": 11470 + }, + { + "epoch": 0.16889329441782772, + "grad_norm": 1.1266759634017944, + "learning_rate": 3.676258809063518e-05, + "loss": 0.7228, + "step": 11501 + }, + { + "epoch": 0.16934853240817227, + "grad_norm": 1.1239662170410156, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.7266, + "step": 11532 + }, + { + "epoch": 0.1698037703985168, + "grad_norm": 1.0813047885894775, + "learning_rate": 3.661411187164166e-05, + "loss": 0.7278, + "step": 11563 + }, + { + "epoch": 0.17025900838886135, + "grad_norm": 56.909889221191406, + "learning_rate": 3.65396767473784e-05, + "loss": 0.7163, + "step": 11594 + }, + { + "epoch": 0.1707142463792059, + "grad_norm": 1.0749647617340088, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 0.17116948436955046, + "grad_norm": 1.0705510377883911, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.726, + "step": 11656 + }, + { + "epoch": 0.171624722359895, + "grad_norm": 1.1083126068115234, + "learning_rate": 3.63155933997859e-05, + "loss": 0.7262, + "step": 11687 + }, + { + "epoch": 0.17207996035023956, + "grad_norm": 1.2302770614624023, + "learning_rate": 3.624064243537758e-05, + "loss": 0.7229, + "step": 11718 + }, + { + "epoch": 0.1725351983405841, + "grad_norm": 1.240893006324768, + "learning_rate": 3.616556462184716e-05, + "loss": 0.7228, + "step": 11749 + }, + { + "epoch": 0.17299043633092864, + "grad_norm": 1.1897794008255005, + "learning_rate": 3.609036080643755e-05, + "loss": 0.7178, + "step": 11780 + }, + { + "epoch": 0.1734456743212732, + "grad_norm": 1.244950294494629, + "learning_rate": 3.60150318378136e-05, + "loss": 0.7244, + "step": 11811 + }, + { + "epoch": 0.17390091231161775, + "grad_norm": 1.1689528226852417, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.7181, + "step": 11842 + }, + { + "epoch": 0.1743561503019623, + "grad_norm": 1.1300387382507324, + "learning_rate": 3.586400184263408e-05, + "loss": 0.7176, + "step": 11873 + }, + { + "epoch": 0.17481138829230686, + "grad_norm": 1.0884469747543335, + "learning_rate": 3.578830252043148e-05, + "loss": 0.7182, + "step": 11904 + }, + { + "epoch": 0.17526662628265138, + "grad_norm": 1.119352102279663, + "learning_rate": 3.571248145370125e-05, + "loss": 0.7223, + "step": 11935 + }, + { + "epoch": 0.17572186427299594, + "grad_norm": 1.0979810953140259, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.7233, + "step": 11966 + }, + { + "epoch": 0.1761771022633405, + "grad_norm": 1.138571858406067, + "learning_rate": 3.556047751054378e-05, + "loss": 0.7279, + "step": 11997 + }, + { + "epoch": 0.17663234025368504, + "grad_norm": 1.2011150121688843, + "learning_rate": 3.548429634946039e-05, + "loss": 0.717, + "step": 12028 + }, + { + "epoch": 0.1770875782440296, + "grad_norm": 1.119472861289978, + "learning_rate": 3.540799687451768e-05, + "loss": 0.7227, + "step": 12059 + }, + { + "epoch": 0.17754281623437415, + "grad_norm": 1.089117169380188, + "learning_rate": 3.533157994674485e-05, + "loss": 0.7132, + "step": 12090 + }, + { + "epoch": 0.17799805422471868, + "grad_norm": 1.0937649011611938, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.7285, + "step": 12121 + }, + { + "epoch": 0.17845329221506323, + "grad_norm": 1.0359808206558228, + "learning_rate": 3.517839718344311e-05, + "loss": 0.7178, + "step": 12152 + }, + { + "epoch": 0.17890853020540778, + "grad_norm": 1.1309690475463867, + "learning_rate": 3.510163307656086e-05, + "loss": 0.713, + "step": 12183 + }, + { + "epoch": 0.17936376819575234, + "grad_norm": 1.126597285270691, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.7139, + "step": 12214 + }, + { + "epoch": 0.1798190061860969, + "grad_norm": 1.29429292678833, + "learning_rate": 3.494776374368643e-05, + "loss": 0.7261, + "step": 12245 + }, + { + "epoch": 0.18027424417644144, + "grad_norm": 1.1156132221221924, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.7124, + "step": 12276 + }, + { + "epoch": 0.18072948216678597, + "grad_norm": 1.1548200845718384, + "learning_rate": 3.479344537543164e-05, + "loss": 0.7195, + "step": 12307 + }, + { + "epoch": 0.18118472015713052, + "grad_norm": 1.1318944692611694, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.7215, + "step": 12338 + }, + { + "epoch": 0.18163995814747508, + "grad_norm": 1.1952061653137207, + "learning_rate": 3.463868493762412e-05, + "loss": 0.7159, + "step": 12369 + }, + { + "epoch": 0.18209519613781963, + "grad_norm": 1.1703433990478516, + "learning_rate": 3.456114112492418e-05, + "loss": 0.7145, + "step": 12400 + }, + { + "epoch": 0.18255043412816419, + "grad_norm": 1.065319538116455, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.7108, + "step": 12431 + }, + { + "epoch": 0.18300567211850874, + "grad_norm": 1.278533697128296, + "learning_rate": 3.440573068727905e-05, + "loss": 0.7116, + "step": 12462 + }, + { + "epoch": 0.18346091010885326, + "grad_norm": 1.1642391681671143, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.716, + "step": 12493 + }, + { + "epoch": 0.18391614809919782, + "grad_norm": 1.2108798027038574, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.7194, + "step": 12524 + }, + { + "epoch": 0.18437138608954237, + "grad_norm": 1.085163950920105, + "learning_rate": 3.417182116258899e-05, + "loss": 0.7166, + "step": 12555 + }, + { + "epoch": 0.18482662407988693, + "grad_norm": 1.124064564704895, + "learning_rate": 3.409364314116074e-05, + "loss": 0.7218, + "step": 12586 + }, + { + "epoch": 0.18528186207023148, + "grad_norm": 1.1817457675933838, + "learning_rate": 3.401536249920559e-05, + "loss": 0.7161, + "step": 12617 + }, + { + "epoch": 0.18573710006057603, + "grad_norm": 1.1430065631866455, + "learning_rate": 3.393698012010998e-05, + "loss": 0.7223, + "step": 12648 + }, + { + "epoch": 0.18619233805092056, + "grad_norm": 1.151633620262146, + "learning_rate": 3.385849688840839e-05, + "loss": 0.7181, + "step": 12679 + }, + { + "epoch": 0.1866475760412651, + "grad_norm": 1.1847879886627197, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.7144, + "step": 12710 + }, + { + "epoch": 0.18710281403160967, + "grad_norm": 1.0997257232666016, + "learning_rate": 3.370123141100578e-05, + "loss": 0.7179, + "step": 12741 + }, + { + "epoch": 0.18755805202195422, + "grad_norm": 1.3207893371582031, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.7147, + "step": 12772 + }, + { + "epoch": 0.18801329001229877, + "grad_norm": 1.1414912939071655, + "learning_rate": 3.35435731658559e-05, + "loss": 0.7258, + "step": 12803 + }, + { + "epoch": 0.18846852800264333, + "grad_norm": 1.1708139181137085, + "learning_rate": 3.346459897862552e-05, + "loss": 0.7222, + "step": 12834 + }, + { + "epoch": 0.18892376599298788, + "grad_norm": 1.0472559928894043, + "learning_rate": 3.338552926954613e-05, + "loss": 0.7168, + "step": 12865 + }, + { + "epoch": 0.1893790039833324, + "grad_norm": 1.1117762327194214, + "learning_rate": 3.330636493090868e-05, + "loss": 0.7128, + "step": 12896 + }, + { + "epoch": 0.18983424197367696, + "grad_norm": 1.1351251602172852, + "learning_rate": 3.322710685607193e-05, + "loss": 0.72, + "step": 12927 + }, + { + "epoch": 0.1902894799640215, + "grad_norm": 1.1530694961547852, + "learning_rate": 3.314775593945251e-05, + "loss": 0.7133, + "step": 12958 + }, + { + "epoch": 0.19074471795436607, + "grad_norm": 1.1590766906738281, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.7236, + "step": 12989 + }, + { + "epoch": 0.19119995594471062, + "grad_norm": 1.0657463073730469, + "learning_rate": 3.298877916376047e-05, + "loss": 0.7159, + "step": 13020 + }, + { + "epoch": 0.19165519393505517, + "grad_norm": 1.090234637260437, + "learning_rate": 3.290915509871915e-05, + "loss": 0.7249, + "step": 13051 + }, + { + "epoch": 0.1921104319253997, + "grad_norm": 1.1001083850860596, + "learning_rate": 3.282944177993753e-05, + "loss": 0.721, + "step": 13082 + }, + { + "epoch": 0.19256566991574425, + "grad_norm": 1.0616897344589233, + "learning_rate": 3.274964010696957e-05, + "loss": 0.7145, + "step": 13113 + }, + { + "epoch": 0.1930209079060888, + "grad_norm": 1.05698823928833, + "learning_rate": 3.266975098036629e-05, + "loss": 0.7106, + "step": 13144 + }, + { + "epoch": 0.19347614589643336, + "grad_norm": 1.0747356414794922, + "learning_rate": 3.258977530166562e-05, + "loss": 0.7199, + "step": 13175 + }, + { + "epoch": 0.1939313838867779, + "grad_norm": 1.112629771232605, + "learning_rate": 3.250971397338227e-05, + "loss": 0.7088, + "step": 13206 + }, + { + "epoch": 0.19438662187712247, + "grad_norm": 1.13507878780365, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.7084, + "step": 13237 + }, + { + "epoch": 0.194841859867467, + "grad_norm": 1.137793779373169, + "learning_rate": 3.234933798294859e-05, + "loss": 0.7121, + "step": 13268 + }, + { + "epoch": 0.19529709785781155, + "grad_norm": 1.263027548789978, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.7117, + "step": 13299 + }, + { + "epoch": 0.1957523358481561, + "grad_norm": 1.1799430847167969, + "learning_rate": 3.218863024832985e-05, + "loss": 0.7144, + "step": 13330 + }, + { + "epoch": 0.19620757383850065, + "grad_norm": 1.040297508239746, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.7089, + "step": 13361 + }, + { + "epoch": 0.1966628118288452, + "grad_norm": 1.110816240310669, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.7166, + "step": 13392 + }, + { + "epoch": 0.19711804981918976, + "grad_norm": 1.0399518013000488, + "learning_rate": 3.194696249871729e-05, + "loss": 0.7081, + "step": 13423 + }, + { + "epoch": 0.1975732878095343, + "grad_norm": 1.1946238279342651, + "learning_rate": 3.186624857814164e-05, + "loss": 0.7112, + "step": 13454 + }, + { + "epoch": 0.19802852579987884, + "grad_norm": 1.1289162635803223, + "learning_rate": 3.178545717288401e-05, + "loss": 0.7079, + "step": 13485 + }, + { + "epoch": 0.1984837637902234, + "grad_norm": 1.17491614818573, + "learning_rate": 3.170458919466444e-05, + "loss": 0.7117, + "step": 13516 + }, + { + "epoch": 0.19893900178056795, + "grad_norm": 1.1498847007751465, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.7131, + "step": 13547 + }, + { + "epoch": 0.1993942397709125, + "grad_norm": 1.2516326904296875, + "learning_rate": 3.154262717052985e-05, + "loss": 0.7145, + "step": 13578 + }, + { + "epoch": 0.19984947776125705, + "grad_norm": 1.2112692594528198, + "learning_rate": 3.146153495233426e-05, + "loss": 0.7098, + "step": 13609 + }, + { + "epoch": 0.20030471575160158, + "grad_norm": 0.989539384841919, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.7182, + "step": 13640 + }, + { + "epoch": 0.20075995374194613, + "grad_norm": 1.0849697589874268, + "learning_rate": 3.129913267924946e-05, + "loss": 0.7153, + "step": 13671 + }, + { + "epoch": 0.2012151917322907, + "grad_norm": 1.1133675575256348, + "learning_rate": 3.121782445704782e-05, + "loss": 0.7104, + "step": 13702 + }, + { + "epoch": 0.20167042972263524, + "grad_norm": 1.1086009740829468, + "learning_rate": 3.11364460675423e-05, + "loss": 0.7125, + "step": 13733 + }, + { + "epoch": 0.2021256677129798, + "grad_norm": 1.0160905122756958, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.7055, + "step": 13764 + }, + { + "epoch": 0.20258090570332435, + "grad_norm": 1.0811957120895386, + "learning_rate": 3.097348246077728e-05, + "loss": 0.7105, + "step": 13795 + }, + { + "epoch": 0.20303614369366887, + "grad_norm": 1.0012198686599731, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.7134, + "step": 13826 + }, + { + "epoch": 0.20349138168401343, + "grad_norm": 1.2298318147659302, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.7054, + "step": 13857 + }, + { + "epoch": 0.20394661967435798, + "grad_norm": 1.1324481964111328, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.7023, + "step": 13888 + }, + { + "epoch": 0.20440185766470254, + "grad_norm": 1.1454377174377441, + "learning_rate": 3.064675369851637e-05, + "loss": 0.7124, + "step": 13919 + }, + { + "epoch": 0.2048570956550471, + "grad_norm": 1.0461289882659912, + "learning_rate": 3.056490989455289e-05, + "loss": 0.7159, + "step": 13950 + }, + { + "epoch": 0.20531233364539164, + "grad_norm": 1.0020164251327515, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.702, + "step": 13981 + }, + { + "epoch": 0.20576757163573617, + "grad_norm": 1.0952621698379517, + "learning_rate": 3.040103481317539e-05, + "loss": 0.7108, + "step": 14012 + }, + { + "epoch": 0.20622280962608072, + "grad_norm": 1.129813551902771, + "learning_rate": 3.03190053850694e-05, + "loss": 0.7043, + "step": 14043 + }, + { + "epoch": 0.20667804761642528, + "grad_norm": 1.012137770652771, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.7002, + "step": 14074 + }, + { + "epoch": 0.20713328560676983, + "grad_norm": 1.1774550676345825, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.6996, + "step": 14105 + }, + { + "epoch": 0.20758852359711438, + "grad_norm": 1.0878807306289673, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.7075, + "step": 14136 + }, + { + "epoch": 0.20804376158745894, + "grad_norm": 1.1599992513656616, + "learning_rate": 2.999029669712431e-05, + "loss": 0.7165, + "step": 14167 + }, + { + "epoch": 0.2084989995778035, + "grad_norm": 1.184403419494629, + "learning_rate": 2.990797641805408e-05, + "loss": 0.709, + "step": 14198 + }, + { + "epoch": 0.20895423756814802, + "grad_norm": 1.0857961177825928, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6986, + "step": 14229 + }, + { + "epoch": 0.20940947555849257, + "grad_norm": 1.0922378301620483, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.7085, + "step": 14260 + }, + { + "epoch": 0.20986471354883712, + "grad_norm": 1.0843058824539185, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.7114, + "step": 14291 + }, + { + "epoch": 0.21031995153918168, + "grad_norm": 1.2273341417312622, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.7092, + "step": 14322 + }, + { + "epoch": 0.21077518952952623, + "grad_norm": 1.1041830778121948, + "learning_rate": 2.949556283611942e-05, + "loss": 0.7086, + "step": 14353 + }, + { + "epoch": 0.21123042751987078, + "grad_norm": 1.1019948720932007, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.7095, + "step": 14384 + }, + { + "epoch": 0.2116856655102153, + "grad_norm": 1.1595929861068726, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.7083, + "step": 14415 + }, + { + "epoch": 0.21214090350055986, + "grad_norm": 1.252021312713623, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.7045, + "step": 14446 + }, + { + "epoch": 0.21259614149090442, + "grad_norm": 1.1469565629959106, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6995, + "step": 14477 + }, + { + "epoch": 0.21305137948124897, + "grad_norm": 1.1796189546585083, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.7101, + "step": 14508 + }, + { + "epoch": 0.21350661747159352, + "grad_norm": 1.183271884918213, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.7074, + "step": 14539 + }, + { + "epoch": 0.21396185546193808, + "grad_norm": 1.2489229440689087, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.7126, + "step": 14570 + }, + { + "epoch": 0.2144170934522826, + "grad_norm": 1.059584379196167, + "learning_rate": 2.883311164593017e-05, + "loss": 0.7045, + "step": 14601 + }, + { + "epoch": 0.21487233144262716, + "grad_norm": 1.1565430164337158, + "learning_rate": 2.875010077160754e-05, + "loss": 0.7075, + "step": 14632 + }, + { + "epoch": 0.2153275694329717, + "grad_norm": 1.0368664264678955, + "learning_rate": 2.866704757790741e-05, + "loss": 0.7065, + "step": 14663 + }, + { + "epoch": 0.21578280742331626, + "grad_norm": 1.1395366191864014, + "learning_rate": 2.858395300207376e-05, + "loss": 0.7029, + "step": 14694 + }, + { + "epoch": 0.21623804541366082, + "grad_norm": 1.077816128730774, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.7078, + "step": 14725 + }, + { + "epoch": 0.21669328340400537, + "grad_norm": 1.2020843029022217, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.7011, + "step": 14756 + }, + { + "epoch": 0.2171485213943499, + "grad_norm": 1.0293958187103271, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.7038, + "step": 14787 + }, + { + "epoch": 0.21760375938469445, + "grad_norm": 1.2163504362106323, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.7029, + "step": 14818 + }, + { + "epoch": 0.218058997375039, + "grad_norm": 1.0222349166870117, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.7084, + "step": 14849 + }, + { + "epoch": 0.21851423536538356, + "grad_norm": 1.0532034635543823, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.7162, + "step": 14880 + }, + { + "epoch": 0.2189694733557281, + "grad_norm": 0.9981489181518555, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.7088, + "step": 14911 + }, + { + "epoch": 0.21942471134607267, + "grad_norm": 1.0804964303970337, + "learning_rate": 2.791781925709473e-05, + "loss": 0.7035, + "step": 14942 + }, + { + "epoch": 0.2198799493364172, + "grad_norm": 1.1326287984848022, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.708, + "step": 14973 + }, + { + "epoch": 0.22033518732676174, + "grad_norm": 1.1164413690567017, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.7047, + "step": 15004 + }, + { + "epoch": 0.2207904253171063, + "grad_norm": 1.1455166339874268, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.708, + "step": 15035 + }, + { + "epoch": 0.22124566330745085, + "grad_norm": 1.1329716444015503, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6923, + "step": 15066 + }, + { + "epoch": 0.2217009012977954, + "grad_norm": 1.3479692935943604, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.7119, + "step": 15097 + }, + { + "epoch": 0.22215613928813996, + "grad_norm": 1.1160681247711182, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.7031, + "step": 15128 + }, + { + "epoch": 0.22261137727848448, + "grad_norm": 1.1188467741012573, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.7029, + "step": 15159 + }, + { + "epoch": 0.22306661526882904, + "grad_norm": 0.9910608530044556, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6996, + "step": 15190 + }, + { + "epoch": 0.2235218532591736, + "grad_norm": 1.100176453590393, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.7027, + "step": 15221 + }, + { + "epoch": 0.22397709124951815, + "grad_norm": 1.1848573684692383, + "learning_rate": 2.708224532974953e-05, + "loss": 0.7111, + "step": 15252 + }, + { + "epoch": 0.2244323292398627, + "grad_norm": 1.1293883323669434, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.7045, + "step": 15283 + }, + { + "epoch": 0.22488756723020725, + "grad_norm": 1.1224740743637085, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.7113, + "step": 15314 + }, + { + "epoch": 0.2253428052205518, + "grad_norm": 1.2419655323028564, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.7062, + "step": 15345 + }, + { + "epoch": 0.22579804321089633, + "grad_norm": 1.1906564235687256, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.7112, + "step": 15376 + }, + { + "epoch": 0.22625328120124089, + "grad_norm": 1.0610102415084839, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.7052, + "step": 15407 + }, + { + "epoch": 0.22670851919158544, + "grad_norm": 1.3254245519638062, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.6975, + "step": 15438 + }, + { + "epoch": 0.22716375718193, + "grad_norm": 1.1128469705581665, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.6993, + "step": 15469 + }, + { + "epoch": 0.22761899517227455, + "grad_norm": 1.0977287292480469, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.7001, + "step": 15500 + }, + { + "epoch": 0.2280742331626191, + "grad_norm": 0.9699016213417053, + "learning_rate": 2.632819298478939e-05, + "loss": 0.7082, + "step": 15531 + }, + { + "epoch": 0.22852947115296363, + "grad_norm": 1.1493170261383057, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.7019, + "step": 15562 + }, + { + "epoch": 0.22898470914330818, + "grad_norm": 1.1549670696258545, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.7087, + "step": 15593 + }, + { + "epoch": 0.22943994713365273, + "grad_norm": 1.2285927534103394, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.695, + "step": 15624 + }, + { + "epoch": 0.2298951851239973, + "grad_norm": 1.0625406503677368, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.7072, + "step": 15655 + }, + { + "epoch": 0.23035042311434184, + "grad_norm": 1.2031610012054443, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.6952, + "step": 15686 + }, + { + "epoch": 0.2308056611046864, + "grad_norm": 1.0590460300445557, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.6981, + "step": 15717 + }, + { + "epoch": 0.23126089909503092, + "grad_norm": 1.0085610151290894, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.7006, + "step": 15748 + }, + { + "epoch": 0.23171613708537547, + "grad_norm": 1.1644418239593506, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.7023, + "step": 15779 + }, + { + "epoch": 0.23217137507572003, + "grad_norm": 1.0243310928344727, + "learning_rate": 2.557292666450159e-05, + "loss": 0.7106, + "step": 15810 + }, + { + "epoch": 0.23262661306606458, + "grad_norm": 1.0970982313156128, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.7018, + "step": 15841 + }, + { + "epoch": 0.23308185105640913, + "grad_norm": 1.0774227380752563, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.7058, + "step": 15872 + }, + { + "epoch": 0.2335370890467537, + "grad_norm": 1.2018071413040161, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.7072, + "step": 15903 + }, + { + "epoch": 0.2339923270370982, + "grad_norm": 1.1195406913757324, + "learning_rate": 2.52370435981567e-05, + "loss": 0.6971, + "step": 15934 + }, + { + "epoch": 0.23444756502744277, + "grad_norm": 1.0707147121429443, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.7005, + "step": 15965 + }, + { + "epoch": 0.23490280301778732, + "grad_norm": 1.0488064289093018, + "learning_rate": 2.506908222441045e-05, + "loss": 0.6994, + "step": 15996 + }, + { + "epoch": 0.23535804100813187, + "grad_norm": 1.0699859857559204, + "learning_rate": 2.498509989430187e-05, + "loss": 0.6931, + "step": 16027 + }, + { + "epoch": 0.23581327899847643, + "grad_norm": 1.0461689233779907, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.7022, + "step": 16058 + }, + { + "epoch": 0.23626851698882098, + "grad_norm": 1.096604824066162, + "learning_rate": 2.481713668624899e-05, + "loss": 0.7043, + "step": 16089 + }, + { + "epoch": 0.2367237549791655, + "grad_norm": 1.0687739849090576, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.7043, + "step": 16120 + }, + { + "epoch": 0.23717899296951006, + "grad_norm": 1.1307755708694458, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.7059, + "step": 16151 + }, + { + "epoch": 0.23763423095985461, + "grad_norm": 1.0404301881790161, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.6981, + "step": 16182 + }, + { + "epoch": 0.23808946895019917, + "grad_norm": 1.0836886167526245, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.7145, + "step": 16213 + }, + { + "epoch": 0.23854470694054372, + "grad_norm": 1.0622589588165283, + "learning_rate": 2.439728136286796e-05, + "loss": 0.7069, + "step": 16244 + }, + { + "epoch": 0.23899994493088828, + "grad_norm": 1.1610299348831177, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.7022, + "step": 16275 + }, + { + "epoch": 0.2394551829212328, + "grad_norm": 1.004273772239685, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.6905, + "step": 16306 + }, + { + "epoch": 0.23991042091157735, + "grad_norm": 1.0684071779251099, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.6977, + "step": 16337 + }, + { + "epoch": 0.2403656589019219, + "grad_norm": 0.9177312850952148, + "learning_rate": 2.406151384602059e-05, + "loss": 0.6979, + "step": 16368 + }, + { + "epoch": 0.24082089689226646, + "grad_norm": 1.0734107494354248, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.6987, + "step": 16399 + }, + { + "epoch": 0.24127613488261102, + "grad_norm": 1.1414164304733276, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.6927, + "step": 16430 + }, + { + "epoch": 0.24173137287295557, + "grad_norm": 1.1547383069992065, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.7053, + "step": 16461 + }, + { + "epoch": 0.2421866108633001, + "grad_norm": 1.0909677743911743, + "learning_rate": 2.372591577780202e-05, + "loss": 0.6987, + "step": 16492 + }, + { + "epoch": 0.24264184885364465, + "grad_norm": 1.0706005096435547, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.7014, + "step": 16523 + }, + { + "epoch": 0.2430970868439892, + "grad_norm": 1.0389344692230225, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.695, + "step": 16554 + }, + { + "epoch": 0.24355232483433376, + "grad_norm": 1.0836538076400757, + "learning_rate": 2.347436487983929e-05, + "loss": 0.7004, + "step": 16585 + }, + { + "epoch": 0.2440075628246783, + "grad_norm": 1.0748459100723267, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.7018, + "step": 16616 + }, + { + "epoch": 0.24446280081502286, + "grad_norm": 1.097935438156128, + "learning_rate": 2.330674878704035e-05, + "loss": 0.706, + "step": 16647 + }, + { + "epoch": 0.24491803880536742, + "grad_norm": 1.1082520484924316, + "learning_rate": 2.322296892997561e-05, + "loss": 0.7012, + "step": 16678 + }, + { + "epoch": 0.24537327679571194, + "grad_norm": 1.0682934522628784, + "learning_rate": 2.313920912646497e-05, + "loss": 0.701, + "step": 16709 + }, + { + "epoch": 0.2458285147860565, + "grad_norm": 1.1116893291473389, + "learning_rate": 2.305547032172643e-05, + "loss": 0.7038, + "step": 16740 + }, + { + "epoch": 0.24628375277640105, + "grad_norm": 1.0376949310302734, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.6998, + "step": 16771 + }, + { + "epoch": 0.2467389907667456, + "grad_norm": 1.0389093160629272, + "learning_rate": 2.288805948824212e-05, + "loss": 0.7043, + "step": 16802 + }, + { + "epoch": 0.24719422875709016, + "grad_norm": 1.0645474195480347, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.6947, + "step": 16833 + }, + { + "epoch": 0.2476494667474347, + "grad_norm": 1.0893995761871338, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.6981, + "step": 16864 + }, + { + "epoch": 0.24810470473777924, + "grad_norm": 1.022275447845459, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.7081, + "step": 16895 + }, + { + "epoch": 0.2485599427281238, + "grad_norm": 1.1055867671966553, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.6942, + "step": 16926 + }, + { + "epoch": 0.24901518071846834, + "grad_norm": 1.0815192461013794, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.6951, + "step": 16957 + }, + { + "epoch": 0.2494704187088129, + "grad_norm": 1.0612388849258423, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.6991, + "step": 16988 + }, + { + "epoch": 0.24992565669915745, + "grad_norm": 1.0434961318969727, + "learning_rate": 2.230292185905114e-05, + "loss": 0.6904, + "step": 17019 + }, + { + "epoch": 0.250380894689502, + "grad_norm": 1.0427175760269165, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.6979, + "step": 17050 + }, + { + "epoch": 0.25083613267984656, + "grad_norm": 1.0715687274932861, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.7034, + "step": 17081 + }, + { + "epoch": 0.2512913706701911, + "grad_norm": 1.0116679668426514, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.6996, + "step": 17112 + }, + { + "epoch": 0.25174660866053566, + "grad_norm": 1.2103781700134277, + "learning_rate": 2.196920634473666e-05, + "loss": 0.7026, + "step": 17143 + }, + { + "epoch": 0.2522018466508802, + "grad_norm": 1.0434819459915161, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.6979, + "step": 17174 + }, + { + "epoch": 0.2526570846412247, + "grad_norm": 1.2911967039108276, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.6866, + "step": 17205 + }, + { + "epoch": 0.2531123226315693, + "grad_norm": 1.1720303297042847, + "learning_rate": 2.171927553439363e-05, + "loss": 0.6868, + "step": 17236 + }, + { + "epoch": 0.2535675606219138, + "grad_norm": 1.0302678346633911, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.691, + "step": 17267 + }, + { + "epoch": 0.2540227986122584, + "grad_norm": 1.0190601348876953, + "learning_rate": 2.155283853988844e-05, + "loss": 0.6964, + "step": 17298 + }, + { + "epoch": 0.25447803660260293, + "grad_norm": 1.109703540802002, + "learning_rate": 2.146967792431106e-05, + "loss": 0.693, + "step": 17329 + }, + { + "epoch": 0.25493327459294746, + "grad_norm": 1.160040020942688, + "learning_rate": 2.138655714793849e-05, + "loss": 0.6943, + "step": 17360 + }, + { + "epoch": 0.25538851258329204, + "grad_norm": 1.083268404006958, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.7024, + "step": 17391 + }, + { + "epoch": 0.25584375057363656, + "grad_norm": 1.0631040334701538, + "learning_rate": 2.122043886437421e-05, + "loss": 0.6896, + "step": 17422 + }, + { + "epoch": 0.25629898856398114, + "grad_norm": 1.2141170501708984, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.7005, + "step": 17453 + }, + { + "epoch": 0.25675422655432567, + "grad_norm": 1.082511067390442, + "learning_rate": 2.105449118766347e-05, + "loss": 0.6906, + "step": 17484 + }, + { + "epoch": 0.25720946454467025, + "grad_norm": 0.9919353127479553, + "learning_rate": 2.097158366805287e-05, + "loss": 0.7017, + "step": 17515 + }, + { + "epoch": 0.2576647025350148, + "grad_norm": 1.0450084209442139, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.696, + "step": 17546 + }, + { + "epoch": 0.2581199405253593, + "grad_norm": 1.0460536479949951, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.6947, + "step": 17577 + }, + { + "epoch": 0.2585751785157039, + "grad_norm": 1.0822510719299316, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.7039, + "step": 17608 + }, + { + "epoch": 0.2590304165060484, + "grad_norm": 1.0411216020584106, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.6962, + "step": 17639 + }, + { + "epoch": 0.259485654496393, + "grad_norm": 1.0115315914154053, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.6954, + "step": 17670 + }, + { + "epoch": 0.2599408924867375, + "grad_norm": 1.0552514791488647, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.6881, + "step": 17701 + }, + { + "epoch": 0.26039613047708204, + "grad_norm": 0.9966985583305359, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.7012, + "step": 17732 + }, + { + "epoch": 0.2608513684674266, + "grad_norm": 1.113692045211792, + "learning_rate": 2.031003855589343e-05, + "loss": 0.703, + "step": 17763 + }, + { + "epoch": 0.26130660645777115, + "grad_norm": 1.0169728994369507, + "learning_rate": 2.022757379528727e-05, + "loss": 0.7008, + "step": 17794 + }, + { + "epoch": 0.26176184444811573, + "grad_norm": 1.1313414573669434, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.6956, + "step": 17825 + }, + { + "epoch": 0.26221708243846026, + "grad_norm": 0.9456464052200317, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.694, + "step": 17856 + }, + { + "epoch": 0.26267232042880484, + "grad_norm": 1.0825542211532593, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.6915, + "step": 17887 + }, + { + "epoch": 0.26312755841914937, + "grad_norm": 1.059581995010376, + "learning_rate": 1.989826261153015e-05, + "loss": 0.6922, + "step": 17918 + }, + { + "epoch": 0.2635827964094939, + "grad_norm": 1.0134432315826416, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.6952, + "step": 17949 + }, + { + "epoch": 0.2640380343998385, + "grad_norm": 0.9800439476966858, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.7036, + "step": 17980 + }, + { + "epoch": 0.264493272390183, + "grad_norm": 1.128818392753601, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.6916, + "step": 18011 + }, + { + "epoch": 0.2649485103805276, + "grad_norm": 1.0002161264419556, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.6886, + "step": 18042 + }, + { + "epoch": 0.2654037483708721, + "grad_norm": 1.1037601232528687, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.6954, + "step": 18073 + }, + { + "epoch": 0.2658589863612167, + "grad_norm": 1.0204657316207886, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.6976, + "step": 18104 + }, + { + "epoch": 0.2663142243515612, + "grad_norm": 1.0254517793655396, + "learning_rate": 1.932422022132275e-05, + "loss": 0.697, + "step": 18135 + }, + { + "epoch": 0.26676946234190574, + "grad_norm": 1.0792242288589478, + "learning_rate": 1.924246297341414e-05, + "loss": 0.6932, + "step": 18166 + }, + { + "epoch": 0.2672247003322503, + "grad_norm": 1.2440094947814941, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.6925, + "step": 18197 + }, + { + "epoch": 0.26767993832259485, + "grad_norm": 1.0181853771209717, + "learning_rate": 1.907914431855156e-05, + "loss": 0.6854, + "step": 18228 + }, + { + "epoch": 0.2681351763129394, + "grad_norm": 0.982681930065155, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.6892, + "step": 18259 + }, + { + "epoch": 0.26859041430328395, + "grad_norm": 1.1587820053100586, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.6955, + "step": 18290 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3509936473079546e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-18312/training_args.bin b/checkpoint-18312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-18312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-21364/config.json b/checkpoint-21364/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-21364/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-21364/generation_config.json b/checkpoint-21364/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-21364/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-21364/model-00001-of-00007.safetensors b/checkpoint-21364/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..deaddc13f5b047d011f16412b8fb01aea3faafa3 --- /dev/null +++ b/checkpoint-21364/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b5078103d18b3335317f2f2058d8f68828720e08c0fe8537128e3187ebdae4 +size 4886466168 diff --git a/checkpoint-21364/model-00002-of-00007.safetensors b/checkpoint-21364/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-21364/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-21364/model-00003-of-00007.safetensors b/checkpoint-21364/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-21364/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-21364/model-00004-of-00007.safetensors b/checkpoint-21364/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-21364/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-21364/model-00005-of-00007.safetensors b/checkpoint-21364/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-21364/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-21364/model-00006-of-00007.safetensors b/checkpoint-21364/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3294703b951d3f1e65572dd47b916f2c60f0fe33 --- /dev/null +++ b/checkpoint-21364/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b9059878e614c5bb21569617e34ff95d5d5ed728186756067f68501ba80ea52 +size 4999813120 diff --git a/checkpoint-21364/model-00007-of-00007.safetensors b/checkpoint-21364/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec67af3d296c2c7a3f2c4a487cc847d867ed75fd --- /dev/null +++ b/checkpoint-21364/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1303be420f509aa196991f050ca5518ddf2bb2eea5afc93085e6718760ce33f +size 2571158184 diff --git a/checkpoint-21364/model.safetensors.index.json b/checkpoint-21364/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-21364/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-21364/optimizer.pt b/checkpoint-21364/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a795bfc8cf44b1981bc15aa29f917bbbe650a7d9 --- /dev/null +++ b/checkpoint-21364/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f01fd561e27569027a2fdf3e5f99228bfdfae69f73abc61ff621536eea4cbb3 +size 15385036334 diff --git a/checkpoint-21364/rng_state.pth b/checkpoint-21364/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-21364/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-21364/scheduler.pt b/checkpoint-21364/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..17783d26dc88c55a75e7564f8dcbad9eacfa9913 --- /dev/null +++ b/checkpoint-21364/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2827eb82750c76bd3279b469098a24605426f9a47a96b155384bcef2e3f4fe20 +size 1064 diff --git a/checkpoint-21364/trainer_state.json b/checkpoint-21364/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..946a309b43af5b310a36fcef3203c3a50bd2b290 --- /dev/null +++ b/checkpoint-21364/trainer_state.json @@ -0,0 +1,4856 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.31373240082970794, + "eval_steps": 500, + "global_step": 21364, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + }, + { + "epoch": 0.13475044514198653, + "grad_norm": 1.226491928100586, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.7349, + "step": 9176 + }, + { + "epoch": 0.13520568313233108, + "grad_norm": 1.3030654191970825, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.7371, + "step": 9207 + }, + { + "epoch": 0.13566092112267564, + "grad_norm": 1.17802894115448, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.7431, + "step": 9238 + }, + { + "epoch": 0.13611615911302016, + "grad_norm": 1.2893658876419067, + "learning_rate": 4.170395751189495e-05, + "loss": 0.7258, + "step": 9269 + }, + { + "epoch": 0.13657139710336472, + "grad_norm": 1.193758487701416, + "learning_rate": 4.164137885110921e-05, + "loss": 0.7362, + "step": 9300 + }, + { + "epoch": 0.13702663509370927, + "grad_norm": 1.2750978469848633, + "learning_rate": 4.157861239462495e-05, + "loss": 0.7265, + "step": 9331 + }, + { + "epoch": 0.13748187308405382, + "grad_norm": 1.161464810371399, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.7389, + "step": 9362 + }, + { + "epoch": 0.13793711107439838, + "grad_norm": 1.2749390602111816, + "learning_rate": 4.145251892991588e-05, + "loss": 0.7315, + "step": 9393 + }, + { + "epoch": 0.13839234906474293, + "grad_norm": 1.1729388236999512, + "learning_rate": 4.138919334463868e-05, + "loss": 0.7352, + "step": 9424 + }, + { + "epoch": 0.13884758705508746, + "grad_norm": 1.0987122058868408, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.7347, + "step": 9455 + }, + { + "epoch": 0.139302825045432, + "grad_norm": 1.0855766534805298, + "learning_rate": 4.126198804133398e-05, + "loss": 0.7271, + "step": 9486 + }, + { + "epoch": 0.13975806303577656, + "grad_norm": 1.1782172918319702, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.74, + "step": 9517 + }, + { + "epoch": 0.14021330102612112, + "grad_norm": 1.186962366104126, + "learning_rate": 4.113404868280107e-05, + "loss": 0.725, + "step": 9548 + }, + { + "epoch": 0.14066853901646567, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.106980553625457e-05, + "loss": 0.7346, + "step": 9579 + }, + { + "epoch": 0.14112377700681022, + "grad_norm": 1.154897928237915, + "learning_rate": 4.100538104413674e-05, + "loss": 0.7348, + "step": 9610 + }, + { + "epoch": 0.14157901499715475, + "grad_norm": 1.203722357749939, + "learning_rate": 4.09407759334692e-05, + "loss": 0.7312, + "step": 9641 + }, + { + "epoch": 0.1420342529874993, + "grad_norm": 1.102858543395996, + "learning_rate": 4.087599093331186e-05, + "loss": 0.7227, + "step": 9672 + }, + { + "epoch": 0.14248949097784386, + "grad_norm": 1.2667406797409058, + "learning_rate": 4.081102677475462e-05, + "loss": 0.727, + "step": 9703 + }, + { + "epoch": 0.1429447289681884, + "grad_norm": 1.221291422843933, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.7317, + "step": 9734 + }, + { + "epoch": 0.14339996695853297, + "grad_norm": 1.0426162481307983, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.7299, + "step": 9765 + }, + { + "epoch": 0.14385520494887752, + "grad_norm": 1.2158912420272827, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.7399, + "step": 9796 + }, + { + "epoch": 0.14431044293922204, + "grad_norm": 1.2689307928085327, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 0.1447656809295666, + "grad_norm": 1.3168833255767822, + "learning_rate": 4.048354433517794e-05, + "loss": 0.7258, + "step": 9858 + }, + { + "epoch": 0.14522091891991115, + "grad_norm": 1.1966122388839722, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.7375, + "step": 9889 + }, + { + "epoch": 0.1456761569102557, + "grad_norm": 1.1252835988998413, + "learning_rate": 4.035132306369438e-05, + "loss": 0.7254, + "step": 9920 + }, + { + "epoch": 0.14613139490060026, + "grad_norm": 1.186324119567871, + "learning_rate": 4.028495219804555e-05, + "loss": 0.7337, + "step": 9951 + }, + { + "epoch": 0.1465866328909448, + "grad_norm": 1.1606496572494507, + "learning_rate": 4.021840884378864e-05, + "loss": 0.7273, + "step": 9982 + }, + { + "epoch": 0.14704187088128934, + "grad_norm": 1.316298007965088, + "learning_rate": 4.015169375185633e-05, + "loss": 0.7276, + "step": 10013 + }, + { + "epoch": 0.1474971088716339, + "grad_norm": 1.224236249923706, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.7328, + "step": 10044 + }, + { + "epoch": 0.14795234686197845, + "grad_norm": 1.227148175239563, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.739, + "step": 10075 + }, + { + "epoch": 0.148407584852323, + "grad_norm": 1.1307867765426636, + "learning_rate": 3.995052558835377e-05, + "loss": 0.7324, + "step": 10106 + }, + { + "epoch": 0.14886282284266755, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.988313109368017e-05, + "loss": 0.7263, + "step": 10137 + }, + { + "epoch": 0.1493180608330121, + "grad_norm": 1.1724634170532227, + "learning_rate": 3.981556864489504e-05, + "loss": 0.7289, + "step": 10168 + }, + { + "epoch": 0.14977329882335666, + "grad_norm": 1.140684723854065, + "learning_rate": 3.974783900443142e-05, + "loss": 0.7309, + "step": 10199 + }, + { + "epoch": 0.15022853681370119, + "grad_norm": 1.167183756828308, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.729, + "step": 10230 + }, + { + "epoch": 0.15068377480404574, + "grad_norm": 1.2216546535491943, + "learning_rate": 3.961188120762596e-05, + "loss": 0.7288, + "step": 10261 + }, + { + "epoch": 0.1511390127943903, + "grad_norm": 1.227397084236145, + "learning_rate": 3.954365458554938e-05, + "loss": 0.7313, + "step": 10292 + }, + { + "epoch": 0.15159425078473485, + "grad_norm": 1.075441837310791, + "learning_rate": 3.947526384030751e-05, + "loss": 0.7306, + "step": 10323 + }, + { + "epoch": 0.1520494887750794, + "grad_norm": 1.1227167844772339, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.7287, + "step": 10354 + }, + { + "epoch": 0.15250472676542395, + "grad_norm": 1.0665740966796875, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.7246, + "step": 10385 + }, + { + "epoch": 0.15295996475576848, + "grad_norm": 1.250580072402954, + "learning_rate": 3.926911459260109e-05, + "loss": 0.7183, + "step": 10416 + }, + { + "epoch": 0.15341520274611303, + "grad_norm": 1.1809351444244385, + "learning_rate": 3.920007509089102e-05, + "loss": 0.7307, + "step": 10447 + }, + { + "epoch": 0.1538704407364576, + "grad_norm": 1.3934037685394287, + "learning_rate": 3.913087534326357e-05, + "loss": 0.7151, + "step": 10478 + }, + { + "epoch": 0.15432567872680214, + "grad_norm": 1.1272441148757935, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.7309, + "step": 10509 + }, + { + "epoch": 0.1547809167171467, + "grad_norm": 1.2410191297531128, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.7375, + "step": 10540 + }, + { + "epoch": 0.15523615470749125, + "grad_norm": 1.1426646709442139, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.7342, + "step": 10571 + }, + { + "epoch": 0.15569139269783577, + "grad_norm": 1.2037526369094849, + "learning_rate": 3.885248953871491e-05, + "loss": 0.7329, + "step": 10602 + }, + { + "epoch": 0.15614663068818033, + "grad_norm": 1.2198660373687744, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.7367, + "step": 10633 + }, + { + "epoch": 0.15660186867852488, + "grad_norm": 1.3129019737243652, + "learning_rate": 3.871235554965218e-05, + "loss": 0.7236, + "step": 10664 + }, + { + "epoch": 0.15705710666886943, + "grad_norm": 1.2182697057724, + "learning_rate": 3.864205604623078e-05, + "loss": 0.721, + "step": 10695 + }, + { + "epoch": 0.157512344659214, + "grad_norm": 1.1494821310043335, + "learning_rate": 3.857160259406107e-05, + "loss": 0.7223, + "step": 10726 + }, + { + "epoch": 0.15796758264955854, + "grad_norm": 1.0680466890335083, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.7224, + "step": 10757 + }, + { + "epoch": 0.15842282063990307, + "grad_norm": 1.155052661895752, + "learning_rate": 3.843023702543556e-05, + "loss": 0.7268, + "step": 10788 + }, + { + "epoch": 0.15887805863024762, + "grad_norm": 1.1415716409683228, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.7167, + "step": 10819 + }, + { + "epoch": 0.15933329662059217, + "grad_norm": 1.1486759185791016, + "learning_rate": 3.828826522492255e-05, + "loss": 0.7236, + "step": 10850 + }, + { + "epoch": 0.15978853461093673, + "grad_norm": 1.1916829347610474, + "learning_rate": 3.821705398930713e-05, + "loss": 0.7269, + "step": 10881 + }, + { + "epoch": 0.16024377260128128, + "grad_norm": 1.180760383605957, + "learning_rate": 3.814569360103385e-05, + "loss": 0.724, + "step": 10912 + }, + { + "epoch": 0.16069901059162583, + "grad_norm": 1.197651982307434, + "learning_rate": 3.807418486539499e-05, + "loss": 0.7265, + "step": 10943 + }, + { + "epoch": 0.16115424858197036, + "grad_norm": 1.1579455137252808, + "learning_rate": 3.80025285893569e-05, + "loss": 0.7247, + "step": 10974 + }, + { + "epoch": 0.16160948657231491, + "grad_norm": 1.1810616254806519, + "learning_rate": 3.793072558155093e-05, + "loss": 0.7256, + "step": 11005 + }, + { + "epoch": 0.16206472456265947, + "grad_norm": 1.1510332822799683, + "learning_rate": 3.785877665226426e-05, + "loss": 0.722, + "step": 11036 + }, + { + "epoch": 0.16251996255300402, + "grad_norm": 1.0499261617660522, + "learning_rate": 3.778668261343079e-05, + "loss": 0.7255, + "step": 11067 + }, + { + "epoch": 0.16297520054334858, + "grad_norm": 1.1509817838668823, + "learning_rate": 3.771444427862192e-05, + "loss": 0.7147, + "step": 11098 + }, + { + "epoch": 0.16343043853369313, + "grad_norm": 1.1638833284378052, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.7187, + "step": 11129 + }, + { + "epoch": 0.16388567652403765, + "grad_norm": 1.024095058441162, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.7244, + "step": 11160 + }, + { + "epoch": 0.1643409145143822, + "grad_norm": 1.1326078176498413, + "learning_rate": 3.749687165842753e-05, + "loss": 0.7221, + "step": 11191 + }, + { + "epoch": 0.16479615250472676, + "grad_norm": 1.1863468885421753, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 0.16525139049507132, + "grad_norm": 1.2948099374771118, + "learning_rate": 3.735111675341645e-05, + "loss": 0.7249, + "step": 11253 + }, + { + "epoch": 0.16570662848541587, + "grad_norm": 1.1384631395339966, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.7203, + "step": 11284 + }, + { + "epoch": 0.16616186647576042, + "grad_norm": 1.1335748434066772, + "learning_rate": 3.720480432728287e-05, + "loss": 0.7183, + "step": 11315 + }, + { + "epoch": 0.16661710446610495, + "grad_norm": 1.2514820098876953, + "learning_rate": 3.71314411067092e-05, + "loss": 0.7216, + "step": 11346 + }, + { + "epoch": 0.1670723424564495, + "grad_norm": 1.128739833831787, + "learning_rate": 3.70579409844715e-05, + "loss": 0.7234, + "step": 11377 + }, + { + "epoch": 0.16752758044679406, + "grad_norm": 1.152342677116394, + "learning_rate": 3.698430479000865e-05, + "loss": 0.7304, + "step": 11408 + }, + { + "epoch": 0.1679828184371386, + "grad_norm": 1.222424864768982, + "learning_rate": 3.691053335429509e-05, + "loss": 0.7164, + "step": 11439 + }, + { + "epoch": 0.16843805642748316, + "grad_norm": 1.2406268119812012, + "learning_rate": 3.683662750983147e-05, + "loss": 0.7157, + "step": 11470 + }, + { + "epoch": 0.16889329441782772, + "grad_norm": 1.1266759634017944, + "learning_rate": 3.676258809063518e-05, + "loss": 0.7228, + "step": 11501 + }, + { + "epoch": 0.16934853240817227, + "grad_norm": 1.1239662170410156, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.7266, + "step": 11532 + }, + { + "epoch": 0.1698037703985168, + "grad_norm": 1.0813047885894775, + "learning_rate": 3.661411187164166e-05, + "loss": 0.7278, + "step": 11563 + }, + { + "epoch": 0.17025900838886135, + "grad_norm": 56.909889221191406, + "learning_rate": 3.65396767473784e-05, + "loss": 0.7163, + "step": 11594 + }, + { + "epoch": 0.1707142463792059, + "grad_norm": 1.0749647617340088, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 0.17116948436955046, + "grad_norm": 1.0705510377883911, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.726, + "step": 11656 + }, + { + "epoch": 0.171624722359895, + "grad_norm": 1.1083126068115234, + "learning_rate": 3.63155933997859e-05, + "loss": 0.7262, + "step": 11687 + }, + { + "epoch": 0.17207996035023956, + "grad_norm": 1.2302770614624023, + "learning_rate": 3.624064243537758e-05, + "loss": 0.7229, + "step": 11718 + }, + { + "epoch": 0.1725351983405841, + "grad_norm": 1.240893006324768, + "learning_rate": 3.616556462184716e-05, + "loss": 0.7228, + "step": 11749 + }, + { + "epoch": 0.17299043633092864, + "grad_norm": 1.1897794008255005, + "learning_rate": 3.609036080643755e-05, + "loss": 0.7178, + "step": 11780 + }, + { + "epoch": 0.1734456743212732, + "grad_norm": 1.244950294494629, + "learning_rate": 3.60150318378136e-05, + "loss": 0.7244, + "step": 11811 + }, + { + "epoch": 0.17390091231161775, + "grad_norm": 1.1689528226852417, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.7181, + "step": 11842 + }, + { + "epoch": 0.1743561503019623, + "grad_norm": 1.1300387382507324, + "learning_rate": 3.586400184263408e-05, + "loss": 0.7176, + "step": 11873 + }, + { + "epoch": 0.17481138829230686, + "grad_norm": 1.0884469747543335, + "learning_rate": 3.578830252043148e-05, + "loss": 0.7182, + "step": 11904 + }, + { + "epoch": 0.17526662628265138, + "grad_norm": 1.119352102279663, + "learning_rate": 3.571248145370125e-05, + "loss": 0.7223, + "step": 11935 + }, + { + "epoch": 0.17572186427299594, + "grad_norm": 1.0979810953140259, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.7233, + "step": 11966 + }, + { + "epoch": 0.1761771022633405, + "grad_norm": 1.138571858406067, + "learning_rate": 3.556047751054378e-05, + "loss": 0.7279, + "step": 11997 + }, + { + "epoch": 0.17663234025368504, + "grad_norm": 1.2011150121688843, + "learning_rate": 3.548429634946039e-05, + "loss": 0.717, + "step": 12028 + }, + { + "epoch": 0.1770875782440296, + "grad_norm": 1.119472861289978, + "learning_rate": 3.540799687451768e-05, + "loss": 0.7227, + "step": 12059 + }, + { + "epoch": 0.17754281623437415, + "grad_norm": 1.089117169380188, + "learning_rate": 3.533157994674485e-05, + "loss": 0.7132, + "step": 12090 + }, + { + "epoch": 0.17799805422471868, + "grad_norm": 1.0937649011611938, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.7285, + "step": 12121 + }, + { + "epoch": 0.17845329221506323, + "grad_norm": 1.0359808206558228, + "learning_rate": 3.517839718344311e-05, + "loss": 0.7178, + "step": 12152 + }, + { + "epoch": 0.17890853020540778, + "grad_norm": 1.1309690475463867, + "learning_rate": 3.510163307656086e-05, + "loss": 0.713, + "step": 12183 + }, + { + "epoch": 0.17936376819575234, + "grad_norm": 1.126597285270691, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.7139, + "step": 12214 + }, + { + "epoch": 0.1798190061860969, + "grad_norm": 1.29429292678833, + "learning_rate": 3.494776374368643e-05, + "loss": 0.7261, + "step": 12245 + }, + { + "epoch": 0.18027424417644144, + "grad_norm": 1.1156132221221924, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.7124, + "step": 12276 + }, + { + "epoch": 0.18072948216678597, + "grad_norm": 1.1548200845718384, + "learning_rate": 3.479344537543164e-05, + "loss": 0.7195, + "step": 12307 + }, + { + "epoch": 0.18118472015713052, + "grad_norm": 1.1318944692611694, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.7215, + "step": 12338 + }, + { + "epoch": 0.18163995814747508, + "grad_norm": 1.1952061653137207, + "learning_rate": 3.463868493762412e-05, + "loss": 0.7159, + "step": 12369 + }, + { + "epoch": 0.18209519613781963, + "grad_norm": 1.1703433990478516, + "learning_rate": 3.456114112492418e-05, + "loss": 0.7145, + "step": 12400 + }, + { + "epoch": 0.18255043412816419, + "grad_norm": 1.065319538116455, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.7108, + "step": 12431 + }, + { + "epoch": 0.18300567211850874, + "grad_norm": 1.278533697128296, + "learning_rate": 3.440573068727905e-05, + "loss": 0.7116, + "step": 12462 + }, + { + "epoch": 0.18346091010885326, + "grad_norm": 1.1642391681671143, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.716, + "step": 12493 + }, + { + "epoch": 0.18391614809919782, + "grad_norm": 1.2108798027038574, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.7194, + "step": 12524 + }, + { + "epoch": 0.18437138608954237, + "grad_norm": 1.085163950920105, + "learning_rate": 3.417182116258899e-05, + "loss": 0.7166, + "step": 12555 + }, + { + "epoch": 0.18482662407988693, + "grad_norm": 1.124064564704895, + "learning_rate": 3.409364314116074e-05, + "loss": 0.7218, + "step": 12586 + }, + { + "epoch": 0.18528186207023148, + "grad_norm": 1.1817457675933838, + "learning_rate": 3.401536249920559e-05, + "loss": 0.7161, + "step": 12617 + }, + { + "epoch": 0.18573710006057603, + "grad_norm": 1.1430065631866455, + "learning_rate": 3.393698012010998e-05, + "loss": 0.7223, + "step": 12648 + }, + { + "epoch": 0.18619233805092056, + "grad_norm": 1.151633620262146, + "learning_rate": 3.385849688840839e-05, + "loss": 0.7181, + "step": 12679 + }, + { + "epoch": 0.1866475760412651, + "grad_norm": 1.1847879886627197, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.7144, + "step": 12710 + }, + { + "epoch": 0.18710281403160967, + "grad_norm": 1.0997257232666016, + "learning_rate": 3.370123141100578e-05, + "loss": 0.7179, + "step": 12741 + }, + { + "epoch": 0.18755805202195422, + "grad_norm": 1.3207893371582031, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.7147, + "step": 12772 + }, + { + "epoch": 0.18801329001229877, + "grad_norm": 1.1414912939071655, + "learning_rate": 3.35435731658559e-05, + "loss": 0.7258, + "step": 12803 + }, + { + "epoch": 0.18846852800264333, + "grad_norm": 1.1708139181137085, + "learning_rate": 3.346459897862552e-05, + "loss": 0.7222, + "step": 12834 + }, + { + "epoch": 0.18892376599298788, + "grad_norm": 1.0472559928894043, + "learning_rate": 3.338552926954613e-05, + "loss": 0.7168, + "step": 12865 + }, + { + "epoch": 0.1893790039833324, + "grad_norm": 1.1117762327194214, + "learning_rate": 3.330636493090868e-05, + "loss": 0.7128, + "step": 12896 + }, + { + "epoch": 0.18983424197367696, + "grad_norm": 1.1351251602172852, + "learning_rate": 3.322710685607193e-05, + "loss": 0.72, + "step": 12927 + }, + { + "epoch": 0.1902894799640215, + "grad_norm": 1.1530694961547852, + "learning_rate": 3.314775593945251e-05, + "loss": 0.7133, + "step": 12958 + }, + { + "epoch": 0.19074471795436607, + "grad_norm": 1.1590766906738281, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.7236, + "step": 12989 + }, + { + "epoch": 0.19119995594471062, + "grad_norm": 1.0657463073730469, + "learning_rate": 3.298877916376047e-05, + "loss": 0.7159, + "step": 13020 + }, + { + "epoch": 0.19165519393505517, + "grad_norm": 1.090234637260437, + "learning_rate": 3.290915509871915e-05, + "loss": 0.7249, + "step": 13051 + }, + { + "epoch": 0.1921104319253997, + "grad_norm": 1.1001083850860596, + "learning_rate": 3.282944177993753e-05, + "loss": 0.721, + "step": 13082 + }, + { + "epoch": 0.19256566991574425, + "grad_norm": 1.0616897344589233, + "learning_rate": 3.274964010696957e-05, + "loss": 0.7145, + "step": 13113 + }, + { + "epoch": 0.1930209079060888, + "grad_norm": 1.05698823928833, + "learning_rate": 3.266975098036629e-05, + "loss": 0.7106, + "step": 13144 + }, + { + "epoch": 0.19347614589643336, + "grad_norm": 1.0747356414794922, + "learning_rate": 3.258977530166562e-05, + "loss": 0.7199, + "step": 13175 + }, + { + "epoch": 0.1939313838867779, + "grad_norm": 1.112629771232605, + "learning_rate": 3.250971397338227e-05, + "loss": 0.7088, + "step": 13206 + }, + { + "epoch": 0.19438662187712247, + "grad_norm": 1.13507878780365, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.7084, + "step": 13237 + }, + { + "epoch": 0.194841859867467, + "grad_norm": 1.137793779373169, + "learning_rate": 3.234933798294859e-05, + "loss": 0.7121, + "step": 13268 + }, + { + "epoch": 0.19529709785781155, + "grad_norm": 1.263027548789978, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.7117, + "step": 13299 + }, + { + "epoch": 0.1957523358481561, + "grad_norm": 1.1799430847167969, + "learning_rate": 3.218863024832985e-05, + "loss": 0.7144, + "step": 13330 + }, + { + "epoch": 0.19620757383850065, + "grad_norm": 1.040297508239746, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.7089, + "step": 13361 + }, + { + "epoch": 0.1966628118288452, + "grad_norm": 1.110816240310669, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.7166, + "step": 13392 + }, + { + "epoch": 0.19711804981918976, + "grad_norm": 1.0399518013000488, + "learning_rate": 3.194696249871729e-05, + "loss": 0.7081, + "step": 13423 + }, + { + "epoch": 0.1975732878095343, + "grad_norm": 1.1946238279342651, + "learning_rate": 3.186624857814164e-05, + "loss": 0.7112, + "step": 13454 + }, + { + "epoch": 0.19802852579987884, + "grad_norm": 1.1289162635803223, + "learning_rate": 3.178545717288401e-05, + "loss": 0.7079, + "step": 13485 + }, + { + "epoch": 0.1984837637902234, + "grad_norm": 1.17491614818573, + "learning_rate": 3.170458919466444e-05, + "loss": 0.7117, + "step": 13516 + }, + { + "epoch": 0.19893900178056795, + "grad_norm": 1.1498847007751465, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.7131, + "step": 13547 + }, + { + "epoch": 0.1993942397709125, + "grad_norm": 1.2516326904296875, + "learning_rate": 3.154262717052985e-05, + "loss": 0.7145, + "step": 13578 + }, + { + "epoch": 0.19984947776125705, + "grad_norm": 1.2112692594528198, + "learning_rate": 3.146153495233426e-05, + "loss": 0.7098, + "step": 13609 + }, + { + "epoch": 0.20030471575160158, + "grad_norm": 0.989539384841919, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.7182, + "step": 13640 + }, + { + "epoch": 0.20075995374194613, + "grad_norm": 1.0849697589874268, + "learning_rate": 3.129913267924946e-05, + "loss": 0.7153, + "step": 13671 + }, + { + "epoch": 0.2012151917322907, + "grad_norm": 1.1133675575256348, + "learning_rate": 3.121782445704782e-05, + "loss": 0.7104, + "step": 13702 + }, + { + "epoch": 0.20167042972263524, + "grad_norm": 1.1086009740829468, + "learning_rate": 3.11364460675423e-05, + "loss": 0.7125, + "step": 13733 + }, + { + "epoch": 0.2021256677129798, + "grad_norm": 1.0160905122756958, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.7055, + "step": 13764 + }, + { + "epoch": 0.20258090570332435, + "grad_norm": 1.0811957120895386, + "learning_rate": 3.097348246077728e-05, + "loss": 0.7105, + "step": 13795 + }, + { + "epoch": 0.20303614369366887, + "grad_norm": 1.0012198686599731, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.7134, + "step": 13826 + }, + { + "epoch": 0.20349138168401343, + "grad_norm": 1.2298318147659302, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.7054, + "step": 13857 + }, + { + "epoch": 0.20394661967435798, + "grad_norm": 1.1324481964111328, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.7023, + "step": 13888 + }, + { + "epoch": 0.20440185766470254, + "grad_norm": 1.1454377174377441, + "learning_rate": 3.064675369851637e-05, + "loss": 0.7124, + "step": 13919 + }, + { + "epoch": 0.2048570956550471, + "grad_norm": 1.0461289882659912, + "learning_rate": 3.056490989455289e-05, + "loss": 0.7159, + "step": 13950 + }, + { + "epoch": 0.20531233364539164, + "grad_norm": 1.0020164251327515, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.702, + "step": 13981 + }, + { + "epoch": 0.20576757163573617, + "grad_norm": 1.0952621698379517, + "learning_rate": 3.040103481317539e-05, + "loss": 0.7108, + "step": 14012 + }, + { + "epoch": 0.20622280962608072, + "grad_norm": 1.129813551902771, + "learning_rate": 3.03190053850694e-05, + "loss": 0.7043, + "step": 14043 + }, + { + "epoch": 0.20667804761642528, + "grad_norm": 1.012137770652771, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.7002, + "step": 14074 + }, + { + "epoch": 0.20713328560676983, + "grad_norm": 1.1774550676345825, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.6996, + "step": 14105 + }, + { + "epoch": 0.20758852359711438, + "grad_norm": 1.0878807306289673, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.7075, + "step": 14136 + }, + { + "epoch": 0.20804376158745894, + "grad_norm": 1.1599992513656616, + "learning_rate": 2.999029669712431e-05, + "loss": 0.7165, + "step": 14167 + }, + { + "epoch": 0.2084989995778035, + "grad_norm": 1.184403419494629, + "learning_rate": 2.990797641805408e-05, + "loss": 0.709, + "step": 14198 + }, + { + "epoch": 0.20895423756814802, + "grad_norm": 1.0857961177825928, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6986, + "step": 14229 + }, + { + "epoch": 0.20940947555849257, + "grad_norm": 1.0922378301620483, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.7085, + "step": 14260 + }, + { + "epoch": 0.20986471354883712, + "grad_norm": 1.0843058824539185, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.7114, + "step": 14291 + }, + { + "epoch": 0.21031995153918168, + "grad_norm": 1.2273341417312622, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.7092, + "step": 14322 + }, + { + "epoch": 0.21077518952952623, + "grad_norm": 1.1041830778121948, + "learning_rate": 2.949556283611942e-05, + "loss": 0.7086, + "step": 14353 + }, + { + "epoch": 0.21123042751987078, + "grad_norm": 1.1019948720932007, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.7095, + "step": 14384 + }, + { + "epoch": 0.2116856655102153, + "grad_norm": 1.1595929861068726, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.7083, + "step": 14415 + }, + { + "epoch": 0.21214090350055986, + "grad_norm": 1.252021312713623, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.7045, + "step": 14446 + }, + { + "epoch": 0.21259614149090442, + "grad_norm": 1.1469565629959106, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6995, + "step": 14477 + }, + { + "epoch": 0.21305137948124897, + "grad_norm": 1.1796189546585083, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.7101, + "step": 14508 + }, + { + "epoch": 0.21350661747159352, + "grad_norm": 1.183271884918213, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.7074, + "step": 14539 + }, + { + "epoch": 0.21396185546193808, + "grad_norm": 1.2489229440689087, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.7126, + "step": 14570 + }, + { + "epoch": 0.2144170934522826, + "grad_norm": 1.059584379196167, + "learning_rate": 2.883311164593017e-05, + "loss": 0.7045, + "step": 14601 + }, + { + "epoch": 0.21487233144262716, + "grad_norm": 1.1565430164337158, + "learning_rate": 2.875010077160754e-05, + "loss": 0.7075, + "step": 14632 + }, + { + "epoch": 0.2153275694329717, + "grad_norm": 1.0368664264678955, + "learning_rate": 2.866704757790741e-05, + "loss": 0.7065, + "step": 14663 + }, + { + "epoch": 0.21578280742331626, + "grad_norm": 1.1395366191864014, + "learning_rate": 2.858395300207376e-05, + "loss": 0.7029, + "step": 14694 + }, + { + "epoch": 0.21623804541366082, + "grad_norm": 1.077816128730774, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.7078, + "step": 14725 + }, + { + "epoch": 0.21669328340400537, + "grad_norm": 1.2020843029022217, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.7011, + "step": 14756 + }, + { + "epoch": 0.2171485213943499, + "grad_norm": 1.0293958187103271, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.7038, + "step": 14787 + }, + { + "epoch": 0.21760375938469445, + "grad_norm": 1.2163504362106323, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.7029, + "step": 14818 + }, + { + "epoch": 0.218058997375039, + "grad_norm": 1.0222349166870117, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.7084, + "step": 14849 + }, + { + "epoch": 0.21851423536538356, + "grad_norm": 1.0532034635543823, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.7162, + "step": 14880 + }, + { + "epoch": 0.2189694733557281, + "grad_norm": 0.9981489181518555, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.7088, + "step": 14911 + }, + { + "epoch": 0.21942471134607267, + "grad_norm": 1.0804964303970337, + "learning_rate": 2.791781925709473e-05, + "loss": 0.7035, + "step": 14942 + }, + { + "epoch": 0.2198799493364172, + "grad_norm": 1.1326287984848022, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.708, + "step": 14973 + }, + { + "epoch": 0.22033518732676174, + "grad_norm": 1.1164413690567017, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.7047, + "step": 15004 + }, + { + "epoch": 0.2207904253171063, + "grad_norm": 1.1455166339874268, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.708, + "step": 15035 + }, + { + "epoch": 0.22124566330745085, + "grad_norm": 1.1329716444015503, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6923, + "step": 15066 + }, + { + "epoch": 0.2217009012977954, + "grad_norm": 1.3479692935943604, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.7119, + "step": 15097 + }, + { + "epoch": 0.22215613928813996, + "grad_norm": 1.1160681247711182, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.7031, + "step": 15128 + }, + { + "epoch": 0.22261137727848448, + "grad_norm": 1.1188467741012573, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.7029, + "step": 15159 + }, + { + "epoch": 0.22306661526882904, + "grad_norm": 0.9910608530044556, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6996, + "step": 15190 + }, + { + "epoch": 0.2235218532591736, + "grad_norm": 1.100176453590393, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.7027, + "step": 15221 + }, + { + "epoch": 0.22397709124951815, + "grad_norm": 1.1848573684692383, + "learning_rate": 2.708224532974953e-05, + "loss": 0.7111, + "step": 15252 + }, + { + "epoch": 0.2244323292398627, + "grad_norm": 1.1293883323669434, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.7045, + "step": 15283 + }, + { + "epoch": 0.22488756723020725, + "grad_norm": 1.1224740743637085, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.7113, + "step": 15314 + }, + { + "epoch": 0.2253428052205518, + "grad_norm": 1.2419655323028564, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.7062, + "step": 15345 + }, + { + "epoch": 0.22579804321089633, + "grad_norm": 1.1906564235687256, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.7112, + "step": 15376 + }, + { + "epoch": 0.22625328120124089, + "grad_norm": 1.0610102415084839, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.7052, + "step": 15407 + }, + { + "epoch": 0.22670851919158544, + "grad_norm": 1.3254245519638062, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.6975, + "step": 15438 + }, + { + "epoch": 0.22716375718193, + "grad_norm": 1.1128469705581665, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.6993, + "step": 15469 + }, + { + "epoch": 0.22761899517227455, + "grad_norm": 1.0977287292480469, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.7001, + "step": 15500 + }, + { + "epoch": 0.2280742331626191, + "grad_norm": 0.9699016213417053, + "learning_rate": 2.632819298478939e-05, + "loss": 0.7082, + "step": 15531 + }, + { + "epoch": 0.22852947115296363, + "grad_norm": 1.1493170261383057, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.7019, + "step": 15562 + }, + { + "epoch": 0.22898470914330818, + "grad_norm": 1.1549670696258545, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.7087, + "step": 15593 + }, + { + "epoch": 0.22943994713365273, + "grad_norm": 1.2285927534103394, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.695, + "step": 15624 + }, + { + "epoch": 0.2298951851239973, + "grad_norm": 1.0625406503677368, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.7072, + "step": 15655 + }, + { + "epoch": 0.23035042311434184, + "grad_norm": 1.2031610012054443, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.6952, + "step": 15686 + }, + { + "epoch": 0.2308056611046864, + "grad_norm": 1.0590460300445557, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.6981, + "step": 15717 + }, + { + "epoch": 0.23126089909503092, + "grad_norm": 1.0085610151290894, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.7006, + "step": 15748 + }, + { + "epoch": 0.23171613708537547, + "grad_norm": 1.1644418239593506, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.7023, + "step": 15779 + }, + { + "epoch": 0.23217137507572003, + "grad_norm": 1.0243310928344727, + "learning_rate": 2.557292666450159e-05, + "loss": 0.7106, + "step": 15810 + }, + { + "epoch": 0.23262661306606458, + "grad_norm": 1.0970982313156128, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.7018, + "step": 15841 + }, + { + "epoch": 0.23308185105640913, + "grad_norm": 1.0774227380752563, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.7058, + "step": 15872 + }, + { + "epoch": 0.2335370890467537, + "grad_norm": 1.2018071413040161, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.7072, + "step": 15903 + }, + { + "epoch": 0.2339923270370982, + "grad_norm": 1.1195406913757324, + "learning_rate": 2.52370435981567e-05, + "loss": 0.6971, + "step": 15934 + }, + { + "epoch": 0.23444756502744277, + "grad_norm": 1.0707147121429443, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.7005, + "step": 15965 + }, + { + "epoch": 0.23490280301778732, + "grad_norm": 1.0488064289093018, + "learning_rate": 2.506908222441045e-05, + "loss": 0.6994, + "step": 15996 + }, + { + "epoch": 0.23535804100813187, + "grad_norm": 1.0699859857559204, + "learning_rate": 2.498509989430187e-05, + "loss": 0.6931, + "step": 16027 + }, + { + "epoch": 0.23581327899847643, + "grad_norm": 1.0461689233779907, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.7022, + "step": 16058 + }, + { + "epoch": 0.23626851698882098, + "grad_norm": 1.096604824066162, + "learning_rate": 2.481713668624899e-05, + "loss": 0.7043, + "step": 16089 + }, + { + "epoch": 0.2367237549791655, + "grad_norm": 1.0687739849090576, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.7043, + "step": 16120 + }, + { + "epoch": 0.23717899296951006, + "grad_norm": 1.1307755708694458, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.7059, + "step": 16151 + }, + { + "epoch": 0.23763423095985461, + "grad_norm": 1.0404301881790161, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.6981, + "step": 16182 + }, + { + "epoch": 0.23808946895019917, + "grad_norm": 1.0836886167526245, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.7145, + "step": 16213 + }, + { + "epoch": 0.23854470694054372, + "grad_norm": 1.0622589588165283, + "learning_rate": 2.439728136286796e-05, + "loss": 0.7069, + "step": 16244 + }, + { + "epoch": 0.23899994493088828, + "grad_norm": 1.1610299348831177, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.7022, + "step": 16275 + }, + { + "epoch": 0.2394551829212328, + "grad_norm": 1.004273772239685, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.6905, + "step": 16306 + }, + { + "epoch": 0.23991042091157735, + "grad_norm": 1.0684071779251099, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.6977, + "step": 16337 + }, + { + "epoch": 0.2403656589019219, + "grad_norm": 0.9177312850952148, + "learning_rate": 2.406151384602059e-05, + "loss": 0.6979, + "step": 16368 + }, + { + "epoch": 0.24082089689226646, + "grad_norm": 1.0734107494354248, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.6987, + "step": 16399 + }, + { + "epoch": 0.24127613488261102, + "grad_norm": 1.1414164304733276, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.6927, + "step": 16430 + }, + { + "epoch": 0.24173137287295557, + "grad_norm": 1.1547383069992065, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.7053, + "step": 16461 + }, + { + "epoch": 0.2421866108633001, + "grad_norm": 1.0909677743911743, + "learning_rate": 2.372591577780202e-05, + "loss": 0.6987, + "step": 16492 + }, + { + "epoch": 0.24264184885364465, + "grad_norm": 1.0706005096435547, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.7014, + "step": 16523 + }, + { + "epoch": 0.2430970868439892, + "grad_norm": 1.0389344692230225, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.695, + "step": 16554 + }, + { + "epoch": 0.24355232483433376, + "grad_norm": 1.0836538076400757, + "learning_rate": 2.347436487983929e-05, + "loss": 0.7004, + "step": 16585 + }, + { + "epoch": 0.2440075628246783, + "grad_norm": 1.0748459100723267, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.7018, + "step": 16616 + }, + { + "epoch": 0.24446280081502286, + "grad_norm": 1.097935438156128, + "learning_rate": 2.330674878704035e-05, + "loss": 0.706, + "step": 16647 + }, + { + "epoch": 0.24491803880536742, + "grad_norm": 1.1082520484924316, + "learning_rate": 2.322296892997561e-05, + "loss": 0.7012, + "step": 16678 + }, + { + "epoch": 0.24537327679571194, + "grad_norm": 1.0682934522628784, + "learning_rate": 2.313920912646497e-05, + "loss": 0.701, + "step": 16709 + }, + { + "epoch": 0.2458285147860565, + "grad_norm": 1.1116893291473389, + "learning_rate": 2.305547032172643e-05, + "loss": 0.7038, + "step": 16740 + }, + { + "epoch": 0.24628375277640105, + "grad_norm": 1.0376949310302734, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.6998, + "step": 16771 + }, + { + "epoch": 0.2467389907667456, + "grad_norm": 1.0389093160629272, + "learning_rate": 2.288805948824212e-05, + "loss": 0.7043, + "step": 16802 + }, + { + "epoch": 0.24719422875709016, + "grad_norm": 1.0645474195480347, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.6947, + "step": 16833 + }, + { + "epoch": 0.2476494667474347, + "grad_norm": 1.0893995761871338, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.6981, + "step": 16864 + }, + { + "epoch": 0.24810470473777924, + "grad_norm": 1.022275447845459, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.7081, + "step": 16895 + }, + { + "epoch": 0.2485599427281238, + "grad_norm": 1.1055867671966553, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.6942, + "step": 16926 + }, + { + "epoch": 0.24901518071846834, + "grad_norm": 1.0815192461013794, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.6951, + "step": 16957 + }, + { + "epoch": 0.2494704187088129, + "grad_norm": 1.0612388849258423, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.6991, + "step": 16988 + }, + { + "epoch": 0.24992565669915745, + "grad_norm": 1.0434961318969727, + "learning_rate": 2.230292185905114e-05, + "loss": 0.6904, + "step": 17019 + }, + { + "epoch": 0.250380894689502, + "grad_norm": 1.0427175760269165, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.6979, + "step": 17050 + }, + { + "epoch": 0.25083613267984656, + "grad_norm": 1.0715687274932861, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.7034, + "step": 17081 + }, + { + "epoch": 0.2512913706701911, + "grad_norm": 1.0116679668426514, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.6996, + "step": 17112 + }, + { + "epoch": 0.25174660866053566, + "grad_norm": 1.2103781700134277, + "learning_rate": 2.196920634473666e-05, + "loss": 0.7026, + "step": 17143 + }, + { + "epoch": 0.2522018466508802, + "grad_norm": 1.0434819459915161, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.6979, + "step": 17174 + }, + { + "epoch": 0.2526570846412247, + "grad_norm": 1.2911967039108276, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.6866, + "step": 17205 + }, + { + "epoch": 0.2531123226315693, + "grad_norm": 1.1720303297042847, + "learning_rate": 2.171927553439363e-05, + "loss": 0.6868, + "step": 17236 + }, + { + "epoch": 0.2535675606219138, + "grad_norm": 1.0302678346633911, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.691, + "step": 17267 + }, + { + "epoch": 0.2540227986122584, + "grad_norm": 1.0190601348876953, + "learning_rate": 2.155283853988844e-05, + "loss": 0.6964, + "step": 17298 + }, + { + "epoch": 0.25447803660260293, + "grad_norm": 1.109703540802002, + "learning_rate": 2.146967792431106e-05, + "loss": 0.693, + "step": 17329 + }, + { + "epoch": 0.25493327459294746, + "grad_norm": 1.160040020942688, + "learning_rate": 2.138655714793849e-05, + "loss": 0.6943, + "step": 17360 + }, + { + "epoch": 0.25538851258329204, + "grad_norm": 1.083268404006958, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.7024, + "step": 17391 + }, + { + "epoch": 0.25584375057363656, + "grad_norm": 1.0631040334701538, + "learning_rate": 2.122043886437421e-05, + "loss": 0.6896, + "step": 17422 + }, + { + "epoch": 0.25629898856398114, + "grad_norm": 1.2141170501708984, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.7005, + "step": 17453 + }, + { + "epoch": 0.25675422655432567, + "grad_norm": 1.082511067390442, + "learning_rate": 2.105449118766347e-05, + "loss": 0.6906, + "step": 17484 + }, + { + "epoch": 0.25720946454467025, + "grad_norm": 0.9919353127479553, + "learning_rate": 2.097158366805287e-05, + "loss": 0.7017, + "step": 17515 + }, + { + "epoch": 0.2576647025350148, + "grad_norm": 1.0450084209442139, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.696, + "step": 17546 + }, + { + "epoch": 0.2581199405253593, + "grad_norm": 1.0460536479949951, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.6947, + "step": 17577 + }, + { + "epoch": 0.2585751785157039, + "grad_norm": 1.0822510719299316, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.7039, + "step": 17608 + }, + { + "epoch": 0.2590304165060484, + "grad_norm": 1.0411216020584106, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.6962, + "step": 17639 + }, + { + "epoch": 0.259485654496393, + "grad_norm": 1.0115315914154053, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.6954, + "step": 17670 + }, + { + "epoch": 0.2599408924867375, + "grad_norm": 1.0552514791488647, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.6881, + "step": 17701 + }, + { + "epoch": 0.26039613047708204, + "grad_norm": 0.9966985583305359, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.7012, + "step": 17732 + }, + { + "epoch": 0.2608513684674266, + "grad_norm": 1.113692045211792, + "learning_rate": 2.031003855589343e-05, + "loss": 0.703, + "step": 17763 + }, + { + "epoch": 0.26130660645777115, + "grad_norm": 1.0169728994369507, + "learning_rate": 2.022757379528727e-05, + "loss": 0.7008, + "step": 17794 + }, + { + "epoch": 0.26176184444811573, + "grad_norm": 1.1313414573669434, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.6956, + "step": 17825 + }, + { + "epoch": 0.26221708243846026, + "grad_norm": 0.9456464052200317, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.694, + "step": 17856 + }, + { + "epoch": 0.26267232042880484, + "grad_norm": 1.0825542211532593, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.6915, + "step": 17887 + }, + { + "epoch": 0.26312755841914937, + "grad_norm": 1.059581995010376, + "learning_rate": 1.989826261153015e-05, + "loss": 0.6922, + "step": 17918 + }, + { + "epoch": 0.2635827964094939, + "grad_norm": 1.0134432315826416, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.6952, + "step": 17949 + }, + { + "epoch": 0.2640380343998385, + "grad_norm": 0.9800439476966858, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.7036, + "step": 17980 + }, + { + "epoch": 0.264493272390183, + "grad_norm": 1.128818392753601, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.6916, + "step": 18011 + }, + { + "epoch": 0.2649485103805276, + "grad_norm": 1.0002161264419556, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.6886, + "step": 18042 + }, + { + "epoch": 0.2654037483708721, + "grad_norm": 1.1037601232528687, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.6954, + "step": 18073 + }, + { + "epoch": 0.2658589863612167, + "grad_norm": 1.0204657316207886, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.6976, + "step": 18104 + }, + { + "epoch": 0.2663142243515612, + "grad_norm": 1.0254517793655396, + "learning_rate": 1.932422022132275e-05, + "loss": 0.697, + "step": 18135 + }, + { + "epoch": 0.26676946234190574, + "grad_norm": 1.0792242288589478, + "learning_rate": 1.924246297341414e-05, + "loss": 0.6932, + "step": 18166 + }, + { + "epoch": 0.2672247003322503, + "grad_norm": 1.2440094947814941, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.6925, + "step": 18197 + }, + { + "epoch": 0.26767993832259485, + "grad_norm": 1.0181853771209717, + "learning_rate": 1.907914431855156e-05, + "loss": 0.6854, + "step": 18228 + }, + { + "epoch": 0.2681351763129394, + "grad_norm": 0.982681930065155, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.6892, + "step": 18259 + }, + { + "epoch": 0.26859041430328395, + "grad_norm": 1.1587820053100586, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.6955, + "step": 18290 + }, + { + "epoch": 0.2690456522936285, + "grad_norm": 1.0297470092773438, + "learning_rate": 1.883466975572098e-05, + "loss": 0.6921, + "step": 18321 + }, + { + "epoch": 0.26950089028397306, + "grad_norm": 1.0646672248840332, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.6966, + "step": 18352 + }, + { + "epoch": 0.2699561282743176, + "grad_norm": 1.0070273876190186, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.7005, + "step": 18383 + }, + { + "epoch": 0.27041136626466217, + "grad_norm": 0.9793278574943542, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.6894, + "step": 18414 + }, + { + "epoch": 0.2708666042550067, + "grad_norm": 1.0349115133285522, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.6906, + "step": 18445 + }, + { + "epoch": 0.2713218422453513, + "grad_norm": 1.0271046161651611, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.6916, + "step": 18476 + }, + { + "epoch": 0.2717770802356958, + "grad_norm": 0.9766640663146973, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.692, + "step": 18507 + }, + { + "epoch": 0.2722323182260403, + "grad_norm": 1.0498918294906616, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.687, + "step": 18538 + }, + { + "epoch": 0.2726875562163849, + "grad_norm": 0.970116138458252, + "learning_rate": 1.818586609711774e-05, + "loss": 0.6923, + "step": 18569 + }, + { + "epoch": 0.27314279420672943, + "grad_norm": 1.1822494268417358, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.6899, + "step": 18600 + }, + { + "epoch": 0.273598032197074, + "grad_norm": 1.0538249015808105, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.6912, + "step": 18631 + }, + { + "epoch": 0.27405327018741854, + "grad_norm": 1.123678207397461, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.6918, + "step": 18662 + }, + { + "epoch": 0.27450850817776307, + "grad_norm": 1.0302077531814575, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.6872, + "step": 18693 + }, + { + "epoch": 0.27496374616810765, + "grad_norm": 1.0867012739181519, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.7006, + "step": 18724 + }, + { + "epoch": 0.2754189841584522, + "grad_norm": 1.0516695976257324, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.6969, + "step": 18755 + }, + { + "epoch": 0.27587422214879675, + "grad_norm": 1.083567500114441, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.6936, + "step": 18786 + }, + { + "epoch": 0.2763294601391413, + "grad_norm": 1.0399643182754517, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.6887, + "step": 18817 + }, + { + "epoch": 0.27678469812948586, + "grad_norm": 1.1514192819595337, + "learning_rate": 1.746186742108387e-05, + "loss": 0.6882, + "step": 18848 + }, + { + "epoch": 0.2772399361198304, + "grad_norm": 1.1234108209609985, + "learning_rate": 1.73818363812215e-05, + "loss": 0.6909, + "step": 18879 + }, + { + "epoch": 0.2776951741101749, + "grad_norm": 1.0432260036468506, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.6826, + "step": 18910 + }, + { + "epoch": 0.2781504121005195, + "grad_norm": 1.2708081007003784, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.694, + "step": 18941 + }, + { + "epoch": 0.278605650090864, + "grad_norm": 0.9991064667701721, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.7001, + "step": 18972 + }, + { + "epoch": 0.2790608880812086, + "grad_norm": 1.103553295135498, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.6974, + "step": 19003 + }, + { + "epoch": 0.27951612607155313, + "grad_norm": 1.0002790689468384, + "learning_rate": 1.698298875964369e-05, + "loss": 0.6951, + "step": 19034 + }, + { + "epoch": 0.27997136406189765, + "grad_norm": 1.0627328157424927, + "learning_rate": 1.690348705220684e-05, + "loss": 0.6914, + "step": 19065 + }, + { + "epoch": 0.28042660205224224, + "grad_norm": 1.152733325958252, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.6909, + "step": 19096 + }, + { + "epoch": 0.28088184004258676, + "grad_norm": 1.1142559051513672, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.6878, + "step": 19127 + }, + { + "epoch": 0.28133707803293134, + "grad_norm": 1.022026538848877, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.6876, + "step": 19158 + }, + { + "epoch": 0.28179231602327587, + "grad_norm": 1.117065668106079, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.6878, + "step": 19189 + }, + { + "epoch": 0.28224755401362045, + "grad_norm": 0.9499729871749878, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.6888, + "step": 19220 + }, + { + "epoch": 0.282702792003965, + "grad_norm": 1.111111044883728, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.6898, + "step": 19251 + }, + { + "epoch": 0.2831580299943095, + "grad_norm": 1.1620928049087524, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.6948, + "step": 19282 + }, + { + "epoch": 0.2836132679846541, + "grad_norm": 1.1431219577789307, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.6929, + "step": 19313 + }, + { + "epoch": 0.2840685059749986, + "grad_norm": 1.1274683475494385, + "learning_rate": 1.619219056243676e-05, + "loss": 0.6823, + "step": 19344 + }, + { + "epoch": 0.2845237439653432, + "grad_norm": 1.1499154567718506, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.6838, + "step": 19375 + }, + { + "epoch": 0.2849789819556877, + "grad_norm": 1.0493180751800537, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.6867, + "step": 19406 + }, + { + "epoch": 0.2854342199460323, + "grad_norm": 0.9728123545646667, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.6889, + "step": 19437 + }, + { + "epoch": 0.2858894579363768, + "grad_norm": 1.0137308835983276, + "learning_rate": 1.587860447859413e-05, + "loss": 0.6892, + "step": 19468 + }, + { + "epoch": 0.28634469592672135, + "grad_norm": 1.0865050554275513, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.6841, + "step": 19499 + }, + { + "epoch": 0.28679993391706593, + "grad_norm": 1.0522550344467163, + "learning_rate": 1.572242550298298e-05, + "loss": 0.6905, + "step": 19530 + }, + { + "epoch": 0.28725517190741046, + "grad_norm": 1.1563197374343872, + "learning_rate": 1.56444926191065e-05, + "loss": 0.6811, + "step": 19561 + }, + { + "epoch": 0.28771040989775504, + "grad_norm": 0.962688684463501, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.6898, + "step": 19592 + }, + { + "epoch": 0.28816564788809956, + "grad_norm": 1.0998531579971313, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.6909, + "step": 19623 + }, + { + "epoch": 0.2886208858784441, + "grad_norm": 1.1609821319580078, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.6844, + "step": 19654 + }, + { + "epoch": 0.28907612386878867, + "grad_norm": 0.9745819568634033, + "learning_rate": 1.533382561658241e-05, + "loss": 0.6933, + "step": 19685 + }, + { + "epoch": 0.2895313618591332, + "grad_norm": 1.085925817489624, + "learning_rate": 1.525642938149541e-05, + "loss": 0.6894, + "step": 19716 + }, + { + "epoch": 0.2899865998494778, + "grad_norm": 1.0314606428146362, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.6965, + "step": 19747 + }, + { + "epoch": 0.2904418378398223, + "grad_norm": 1.0771900415420532, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.6904, + "step": 19778 + }, + { + "epoch": 0.2908970758301669, + "grad_norm": 0.9729062914848328, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.6886, + "step": 19809 + }, + { + "epoch": 0.2913523138205114, + "grad_norm": 1.0824676752090454, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.6804, + "step": 19840 + }, + { + "epoch": 0.29180755181085594, + "grad_norm": 1.0260144472122192, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.6905, + "step": 19871 + }, + { + "epoch": 0.2922627898012005, + "grad_norm": 0.9324101209640503, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.6877, + "step": 19902 + }, + { + "epoch": 0.29271802779154504, + "grad_norm": 1.0553687810897827, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.6928, + "step": 19933 + }, + { + "epoch": 0.2931732657818896, + "grad_norm": 1.129400610923767, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.6905, + "step": 19964 + }, + { + "epoch": 0.29362850377223415, + "grad_norm": 1.064041018486023, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.6936, + "step": 19995 + }, + { + "epoch": 0.2940837417625787, + "grad_norm": 1.116929292678833, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.6818, + "step": 20026 + }, + { + "epoch": 0.29453897975292326, + "grad_norm": 1.0334928035736084, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.6887, + "step": 20057 + }, + { + "epoch": 0.2949942177432678, + "grad_norm": 1.0690734386444092, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.6885, + "step": 20088 + }, + { + "epoch": 0.29544945573361237, + "grad_norm": 1.1211203336715698, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.6919, + "step": 20119 + }, + { + "epoch": 0.2959046937239569, + "grad_norm": 0.9984875917434692, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.6892, + "step": 20150 + }, + { + "epoch": 0.29635993171430147, + "grad_norm": 1.0159475803375244, + "learning_rate": 1.410916653306954e-05, + "loss": 0.682, + "step": 20181 + }, + { + "epoch": 0.296815169704646, + "grad_norm": 0.9778633117675781, + "learning_rate": 1.403363351752639e-05, + "loss": 0.6808, + "step": 20212 + }, + { + "epoch": 0.2972704076949905, + "grad_norm": 1.1207058429718018, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.6852, + "step": 20243 + }, + { + "epoch": 0.2977256456853351, + "grad_norm": 1.0286227464675903, + "learning_rate": 1.388293959928911e-05, + "loss": 0.6886, + "step": 20274 + }, + { + "epoch": 0.29818088367567963, + "grad_norm": 1.0112954378128052, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.6878, + "step": 20305 + }, + { + "epoch": 0.2986361216660242, + "grad_norm": 1.0683724880218506, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.6889, + "step": 20336 + }, + { + "epoch": 0.29909135965636874, + "grad_norm": 1.0744072198867798, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.6791, + "step": 20367 + }, + { + "epoch": 0.2995465976467133, + "grad_norm": 1.0279752016067505, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.684, + "step": 20398 + }, + { + "epoch": 0.30000183563705785, + "grad_norm": 0.9995334148406982, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.6906, + "step": 20429 + }, + { + "epoch": 0.30045707362740237, + "grad_norm": 1.351607322692871, + "learning_rate": 1.343389583978327e-05, + "loss": 0.6964, + "step": 20460 + }, + { + "epoch": 0.30091231161774695, + "grad_norm": 1.0838359594345093, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.6784, + "step": 20491 + }, + { + "epoch": 0.3013675496080915, + "grad_norm": 1.0536307096481323, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.6872, + "step": 20522 + }, + { + "epoch": 0.30182278759843606, + "grad_norm": 0.9636529088020325, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.6914, + "step": 20553 + }, + { + "epoch": 0.3022780255887806, + "grad_norm": 1.1852017641067505, + "learning_rate": 1.313713250302451e-05, + "loss": 0.6821, + "step": 20584 + }, + { + "epoch": 0.3027332635791251, + "grad_norm": 1.072434425354004, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.695, + "step": 20615 + }, + { + "epoch": 0.3031885015694697, + "grad_norm": 1.2345269918441772, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.6824, + "step": 20646 + }, + { + "epoch": 0.3036437395598142, + "grad_norm": 1.0516636371612549, + "learning_rate": 1.291596270869846e-05, + "loss": 0.6854, + "step": 20677 + }, + { + "epoch": 0.3040989775501588, + "grad_norm": 1.0413544178009033, + "learning_rate": 1.284251106960927e-05, + "loss": 0.6895, + "step": 20708 + }, + { + "epoch": 0.3045542155405033, + "grad_norm": 1.158065676689148, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.6861, + "step": 20739 + }, + { + "epoch": 0.3050094535308479, + "grad_norm": 1.0109269618988037, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.6898, + "step": 20770 + }, + { + "epoch": 0.30546469152119243, + "grad_norm": 0.9886858463287354, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.6874, + "step": 20801 + }, + { + "epoch": 0.30591992951153696, + "grad_norm": 1.0234347581863403, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.6823, + "step": 20832 + }, + { + "epoch": 0.30637516750188154, + "grad_norm": 1.028950810432434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.6884, + "step": 20863 + }, + { + "epoch": 0.30683040549222607, + "grad_norm": 1.1941654682159424, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.7019, + "step": 20894 + }, + { + "epoch": 0.30728564348257065, + "grad_norm": 1.0201176404953003, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.6846, + "step": 20925 + }, + { + "epoch": 0.3077408814729152, + "grad_norm": 0.9765841364860535, + "learning_rate": 1.225990629829241e-05, + "loss": 0.6881, + "step": 20956 + }, + { + "epoch": 0.3081961194632597, + "grad_norm": 1.0036793947219849, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.6849, + "step": 20987 + }, + { + "epoch": 0.3086513574536043, + "grad_norm": 1.1151163578033447, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.6825, + "step": 21018 + }, + { + "epoch": 0.3091065954439488, + "grad_norm": 1.0734307765960693, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.6902, + "step": 21049 + }, + { + "epoch": 0.3095618334342934, + "grad_norm": 0.9811964631080627, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.6883, + "step": 21080 + }, + { + "epoch": 0.3100170714246379, + "grad_norm": 1.0949833393096924, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.6873, + "step": 21111 + }, + { + "epoch": 0.3104723094149825, + "grad_norm": 1.0459587574005127, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.6853, + "step": 21142 + }, + { + "epoch": 0.310927547405327, + "grad_norm": 1.1628592014312744, + "learning_rate": 1.175766039353062e-05, + "loss": 0.6837, + "step": 21173 + }, + { + "epoch": 0.31138278539567155, + "grad_norm": 0.9916526079177856, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.6838, + "step": 21204 + }, + { + "epoch": 0.3118380233860161, + "grad_norm": 0.9945309162139893, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.6811, + "step": 21235 + }, + { + "epoch": 0.31229326137636065, + "grad_norm": 1.0234261751174927, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.6833, + "step": 21266 + }, + { + "epoch": 0.31274849936670523, + "grad_norm": 0.999071478843689, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.6858, + "step": 21297 + }, + { + "epoch": 0.31320373735704976, + "grad_norm": 1.0478752851486206, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.6918, + "step": 21328 + }, + { + "epoch": 0.3136589753473943, + "grad_norm": 1.083009958267212, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.6758, + "step": 21359 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5761592551926137e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-21364/training_args.bin b/checkpoint-21364/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-21364/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-24416/config.json b/checkpoint-24416/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-24416/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-24416/generation_config.json b/checkpoint-24416/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-24416/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-24416/model-00001-of-00007.safetensors b/checkpoint-24416/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..acf5a5b82593d6846c0a25916e54eec5ee7febee --- /dev/null +++ b/checkpoint-24416/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49ec922d11f8cb27b9f37790e1427c952d2bf2b86f14df395570bb3395b798df +size 4886466168 diff --git a/checkpoint-24416/model-00002-of-00007.safetensors b/checkpoint-24416/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-24416/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-24416/model-00003-of-00007.safetensors b/checkpoint-24416/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-24416/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-24416/model-00004-of-00007.safetensors b/checkpoint-24416/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-24416/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-24416/model-00005-of-00007.safetensors b/checkpoint-24416/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-24416/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-24416/model-00006-of-00007.safetensors b/checkpoint-24416/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4985a75fabf35a7001e5466d1c4dc50ab3cdf80f --- /dev/null +++ b/checkpoint-24416/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00f967f92331ef41b6b91090d4a0f77a0bcef4b624a5e5cc22319150c96d0685 +size 4999813120 diff --git a/checkpoint-24416/model-00007-of-00007.safetensors b/checkpoint-24416/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a5906fe3b693287bf5aa7daffb35bc3f17bc3e8 --- /dev/null +++ b/checkpoint-24416/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499121e194a4354ce6f4914d765cd8128790b75757ef359c3ba72f3809e062b3 +size 2571158184 diff --git a/checkpoint-24416/model.safetensors.index.json b/checkpoint-24416/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-24416/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-24416/optimizer.pt b/checkpoint-24416/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce842abf9c6e68d12028742084e15e9d4a835452 --- /dev/null +++ b/checkpoint-24416/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59f96eb71613b7fdc4d519f90d41b4584ae9c2055b9d393ba5b47771bcd3928 +size 15385036334 diff --git a/checkpoint-24416/rng_state.pth b/checkpoint-24416/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-24416/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-24416/scheduler.pt b/checkpoint-24416/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c42ab2446b20c095538f06fcf92f01ac58007a07 --- /dev/null +++ b/checkpoint-24416/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719f421c0e2563868e52a38d7c300a4ceee2dbf15648505f514dae6bb8a5e723 +size 1064 diff --git a/checkpoint-24416/trainer_state.json b/checkpoint-24416/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0a047e67666d7801e64b23f03eb3e90cc8dfc70f --- /dev/null +++ b/checkpoint-24416/trainer_state.json @@ -0,0 +1,5542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.35855131523395195, + "eval_steps": 500, + "global_step": 24416, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + }, + { + "epoch": 0.13475044514198653, + "grad_norm": 1.226491928100586, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.7349, + "step": 9176 + }, + { + "epoch": 0.13520568313233108, + "grad_norm": 1.3030654191970825, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.7371, + "step": 9207 + }, + { + "epoch": 0.13566092112267564, + "grad_norm": 1.17802894115448, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.7431, + "step": 9238 + }, + { + "epoch": 0.13611615911302016, + "grad_norm": 1.2893658876419067, + "learning_rate": 4.170395751189495e-05, + "loss": 0.7258, + "step": 9269 + }, + { + "epoch": 0.13657139710336472, + "grad_norm": 1.193758487701416, + "learning_rate": 4.164137885110921e-05, + "loss": 0.7362, + "step": 9300 + }, + { + "epoch": 0.13702663509370927, + "grad_norm": 1.2750978469848633, + "learning_rate": 4.157861239462495e-05, + "loss": 0.7265, + "step": 9331 + }, + { + "epoch": 0.13748187308405382, + "grad_norm": 1.161464810371399, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.7389, + "step": 9362 + }, + { + "epoch": 0.13793711107439838, + "grad_norm": 1.2749390602111816, + "learning_rate": 4.145251892991588e-05, + "loss": 0.7315, + "step": 9393 + }, + { + "epoch": 0.13839234906474293, + "grad_norm": 1.1729388236999512, + "learning_rate": 4.138919334463868e-05, + "loss": 0.7352, + "step": 9424 + }, + { + "epoch": 0.13884758705508746, + "grad_norm": 1.0987122058868408, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.7347, + "step": 9455 + }, + { + "epoch": 0.139302825045432, + "grad_norm": 1.0855766534805298, + "learning_rate": 4.126198804133398e-05, + "loss": 0.7271, + "step": 9486 + }, + { + "epoch": 0.13975806303577656, + "grad_norm": 1.1782172918319702, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.74, + "step": 9517 + }, + { + "epoch": 0.14021330102612112, + "grad_norm": 1.186962366104126, + "learning_rate": 4.113404868280107e-05, + "loss": 0.725, + "step": 9548 + }, + { + "epoch": 0.14066853901646567, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.106980553625457e-05, + "loss": 0.7346, + "step": 9579 + }, + { + "epoch": 0.14112377700681022, + "grad_norm": 1.154897928237915, + "learning_rate": 4.100538104413674e-05, + "loss": 0.7348, + "step": 9610 + }, + { + "epoch": 0.14157901499715475, + "grad_norm": 1.203722357749939, + "learning_rate": 4.09407759334692e-05, + "loss": 0.7312, + "step": 9641 + }, + { + "epoch": 0.1420342529874993, + "grad_norm": 1.102858543395996, + "learning_rate": 4.087599093331186e-05, + "loss": 0.7227, + "step": 9672 + }, + { + "epoch": 0.14248949097784386, + "grad_norm": 1.2667406797409058, + "learning_rate": 4.081102677475462e-05, + "loss": 0.727, + "step": 9703 + }, + { + "epoch": 0.1429447289681884, + "grad_norm": 1.221291422843933, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.7317, + "step": 9734 + }, + { + "epoch": 0.14339996695853297, + "grad_norm": 1.0426162481307983, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.7299, + "step": 9765 + }, + { + "epoch": 0.14385520494887752, + "grad_norm": 1.2158912420272827, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.7399, + "step": 9796 + }, + { + "epoch": 0.14431044293922204, + "grad_norm": 1.2689307928085327, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 0.1447656809295666, + "grad_norm": 1.3168833255767822, + "learning_rate": 4.048354433517794e-05, + "loss": 0.7258, + "step": 9858 + }, + { + "epoch": 0.14522091891991115, + "grad_norm": 1.1966122388839722, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.7375, + "step": 9889 + }, + { + "epoch": 0.1456761569102557, + "grad_norm": 1.1252835988998413, + "learning_rate": 4.035132306369438e-05, + "loss": 0.7254, + "step": 9920 + }, + { + "epoch": 0.14613139490060026, + "grad_norm": 1.186324119567871, + "learning_rate": 4.028495219804555e-05, + "loss": 0.7337, + "step": 9951 + }, + { + "epoch": 0.1465866328909448, + "grad_norm": 1.1606496572494507, + "learning_rate": 4.021840884378864e-05, + "loss": 0.7273, + "step": 9982 + }, + { + "epoch": 0.14704187088128934, + "grad_norm": 1.316298007965088, + "learning_rate": 4.015169375185633e-05, + "loss": 0.7276, + "step": 10013 + }, + { + "epoch": 0.1474971088716339, + "grad_norm": 1.224236249923706, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.7328, + "step": 10044 + }, + { + "epoch": 0.14795234686197845, + "grad_norm": 1.227148175239563, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.739, + "step": 10075 + }, + { + "epoch": 0.148407584852323, + "grad_norm": 1.1307867765426636, + "learning_rate": 3.995052558835377e-05, + "loss": 0.7324, + "step": 10106 + }, + { + "epoch": 0.14886282284266755, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.988313109368017e-05, + "loss": 0.7263, + "step": 10137 + }, + { + "epoch": 0.1493180608330121, + "grad_norm": 1.1724634170532227, + "learning_rate": 3.981556864489504e-05, + "loss": 0.7289, + "step": 10168 + }, + { + "epoch": 0.14977329882335666, + "grad_norm": 1.140684723854065, + "learning_rate": 3.974783900443142e-05, + "loss": 0.7309, + "step": 10199 + }, + { + "epoch": 0.15022853681370119, + "grad_norm": 1.167183756828308, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.729, + "step": 10230 + }, + { + "epoch": 0.15068377480404574, + "grad_norm": 1.2216546535491943, + "learning_rate": 3.961188120762596e-05, + "loss": 0.7288, + "step": 10261 + }, + { + "epoch": 0.1511390127943903, + "grad_norm": 1.227397084236145, + "learning_rate": 3.954365458554938e-05, + "loss": 0.7313, + "step": 10292 + }, + { + "epoch": 0.15159425078473485, + "grad_norm": 1.075441837310791, + "learning_rate": 3.947526384030751e-05, + "loss": 0.7306, + "step": 10323 + }, + { + "epoch": 0.1520494887750794, + "grad_norm": 1.1227167844772339, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.7287, + "step": 10354 + }, + { + "epoch": 0.15250472676542395, + "grad_norm": 1.0665740966796875, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.7246, + "step": 10385 + }, + { + "epoch": 0.15295996475576848, + "grad_norm": 1.250580072402954, + "learning_rate": 3.926911459260109e-05, + "loss": 0.7183, + "step": 10416 + }, + { + "epoch": 0.15341520274611303, + "grad_norm": 1.1809351444244385, + "learning_rate": 3.920007509089102e-05, + "loss": 0.7307, + "step": 10447 + }, + { + "epoch": 0.1538704407364576, + "grad_norm": 1.3934037685394287, + "learning_rate": 3.913087534326357e-05, + "loss": 0.7151, + "step": 10478 + }, + { + "epoch": 0.15432567872680214, + "grad_norm": 1.1272441148757935, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.7309, + "step": 10509 + }, + { + "epoch": 0.1547809167171467, + "grad_norm": 1.2410191297531128, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.7375, + "step": 10540 + }, + { + "epoch": 0.15523615470749125, + "grad_norm": 1.1426646709442139, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.7342, + "step": 10571 + }, + { + "epoch": 0.15569139269783577, + "grad_norm": 1.2037526369094849, + "learning_rate": 3.885248953871491e-05, + "loss": 0.7329, + "step": 10602 + }, + { + "epoch": 0.15614663068818033, + "grad_norm": 1.2198660373687744, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.7367, + "step": 10633 + }, + { + "epoch": 0.15660186867852488, + "grad_norm": 1.3129019737243652, + "learning_rate": 3.871235554965218e-05, + "loss": 0.7236, + "step": 10664 + }, + { + "epoch": 0.15705710666886943, + "grad_norm": 1.2182697057724, + "learning_rate": 3.864205604623078e-05, + "loss": 0.721, + "step": 10695 + }, + { + "epoch": 0.157512344659214, + "grad_norm": 1.1494821310043335, + "learning_rate": 3.857160259406107e-05, + "loss": 0.7223, + "step": 10726 + }, + { + "epoch": 0.15796758264955854, + "grad_norm": 1.0680466890335083, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.7224, + "step": 10757 + }, + { + "epoch": 0.15842282063990307, + "grad_norm": 1.155052661895752, + "learning_rate": 3.843023702543556e-05, + "loss": 0.7268, + "step": 10788 + }, + { + "epoch": 0.15887805863024762, + "grad_norm": 1.1415716409683228, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.7167, + "step": 10819 + }, + { + "epoch": 0.15933329662059217, + "grad_norm": 1.1486759185791016, + "learning_rate": 3.828826522492255e-05, + "loss": 0.7236, + "step": 10850 + }, + { + "epoch": 0.15978853461093673, + "grad_norm": 1.1916829347610474, + "learning_rate": 3.821705398930713e-05, + "loss": 0.7269, + "step": 10881 + }, + { + "epoch": 0.16024377260128128, + "grad_norm": 1.180760383605957, + "learning_rate": 3.814569360103385e-05, + "loss": 0.724, + "step": 10912 + }, + { + "epoch": 0.16069901059162583, + "grad_norm": 1.197651982307434, + "learning_rate": 3.807418486539499e-05, + "loss": 0.7265, + "step": 10943 + }, + { + "epoch": 0.16115424858197036, + "grad_norm": 1.1579455137252808, + "learning_rate": 3.80025285893569e-05, + "loss": 0.7247, + "step": 10974 + }, + { + "epoch": 0.16160948657231491, + "grad_norm": 1.1810616254806519, + "learning_rate": 3.793072558155093e-05, + "loss": 0.7256, + "step": 11005 + }, + { + "epoch": 0.16206472456265947, + "grad_norm": 1.1510332822799683, + "learning_rate": 3.785877665226426e-05, + "loss": 0.722, + "step": 11036 + }, + { + "epoch": 0.16251996255300402, + "grad_norm": 1.0499261617660522, + "learning_rate": 3.778668261343079e-05, + "loss": 0.7255, + "step": 11067 + }, + { + "epoch": 0.16297520054334858, + "grad_norm": 1.1509817838668823, + "learning_rate": 3.771444427862192e-05, + "loss": 0.7147, + "step": 11098 + }, + { + "epoch": 0.16343043853369313, + "grad_norm": 1.1638833284378052, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.7187, + "step": 11129 + }, + { + "epoch": 0.16388567652403765, + "grad_norm": 1.024095058441162, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.7244, + "step": 11160 + }, + { + "epoch": 0.1643409145143822, + "grad_norm": 1.1326078176498413, + "learning_rate": 3.749687165842753e-05, + "loss": 0.7221, + "step": 11191 + }, + { + "epoch": 0.16479615250472676, + "grad_norm": 1.1863468885421753, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 0.16525139049507132, + "grad_norm": 1.2948099374771118, + "learning_rate": 3.735111675341645e-05, + "loss": 0.7249, + "step": 11253 + }, + { + "epoch": 0.16570662848541587, + "grad_norm": 1.1384631395339966, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.7203, + "step": 11284 + }, + { + "epoch": 0.16616186647576042, + "grad_norm": 1.1335748434066772, + "learning_rate": 3.720480432728287e-05, + "loss": 0.7183, + "step": 11315 + }, + { + "epoch": 0.16661710446610495, + "grad_norm": 1.2514820098876953, + "learning_rate": 3.71314411067092e-05, + "loss": 0.7216, + "step": 11346 + }, + { + "epoch": 0.1670723424564495, + "grad_norm": 1.128739833831787, + "learning_rate": 3.70579409844715e-05, + "loss": 0.7234, + "step": 11377 + }, + { + "epoch": 0.16752758044679406, + "grad_norm": 1.152342677116394, + "learning_rate": 3.698430479000865e-05, + "loss": 0.7304, + "step": 11408 + }, + { + "epoch": 0.1679828184371386, + "grad_norm": 1.222424864768982, + "learning_rate": 3.691053335429509e-05, + "loss": 0.7164, + "step": 11439 + }, + { + "epoch": 0.16843805642748316, + "grad_norm": 1.2406268119812012, + "learning_rate": 3.683662750983147e-05, + "loss": 0.7157, + "step": 11470 + }, + { + "epoch": 0.16889329441782772, + "grad_norm": 1.1266759634017944, + "learning_rate": 3.676258809063518e-05, + "loss": 0.7228, + "step": 11501 + }, + { + "epoch": 0.16934853240817227, + "grad_norm": 1.1239662170410156, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.7266, + "step": 11532 + }, + { + "epoch": 0.1698037703985168, + "grad_norm": 1.0813047885894775, + "learning_rate": 3.661411187164166e-05, + "loss": 0.7278, + "step": 11563 + }, + { + "epoch": 0.17025900838886135, + "grad_norm": 56.909889221191406, + "learning_rate": 3.65396767473784e-05, + "loss": 0.7163, + "step": 11594 + }, + { + "epoch": 0.1707142463792059, + "grad_norm": 1.0749647617340088, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 0.17116948436955046, + "grad_norm": 1.0705510377883911, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.726, + "step": 11656 + }, + { + "epoch": 0.171624722359895, + "grad_norm": 1.1083126068115234, + "learning_rate": 3.63155933997859e-05, + "loss": 0.7262, + "step": 11687 + }, + { + "epoch": 0.17207996035023956, + "grad_norm": 1.2302770614624023, + "learning_rate": 3.624064243537758e-05, + "loss": 0.7229, + "step": 11718 + }, + { + "epoch": 0.1725351983405841, + "grad_norm": 1.240893006324768, + "learning_rate": 3.616556462184716e-05, + "loss": 0.7228, + "step": 11749 + }, + { + "epoch": 0.17299043633092864, + "grad_norm": 1.1897794008255005, + "learning_rate": 3.609036080643755e-05, + "loss": 0.7178, + "step": 11780 + }, + { + "epoch": 0.1734456743212732, + "grad_norm": 1.244950294494629, + "learning_rate": 3.60150318378136e-05, + "loss": 0.7244, + "step": 11811 + }, + { + "epoch": 0.17390091231161775, + "grad_norm": 1.1689528226852417, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.7181, + "step": 11842 + }, + { + "epoch": 0.1743561503019623, + "grad_norm": 1.1300387382507324, + "learning_rate": 3.586400184263408e-05, + "loss": 0.7176, + "step": 11873 + }, + { + "epoch": 0.17481138829230686, + "grad_norm": 1.0884469747543335, + "learning_rate": 3.578830252043148e-05, + "loss": 0.7182, + "step": 11904 + }, + { + "epoch": 0.17526662628265138, + "grad_norm": 1.119352102279663, + "learning_rate": 3.571248145370125e-05, + "loss": 0.7223, + "step": 11935 + }, + { + "epoch": 0.17572186427299594, + "grad_norm": 1.0979810953140259, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.7233, + "step": 11966 + }, + { + "epoch": 0.1761771022633405, + "grad_norm": 1.138571858406067, + "learning_rate": 3.556047751054378e-05, + "loss": 0.7279, + "step": 11997 + }, + { + "epoch": 0.17663234025368504, + "grad_norm": 1.2011150121688843, + "learning_rate": 3.548429634946039e-05, + "loss": 0.717, + "step": 12028 + }, + { + "epoch": 0.1770875782440296, + "grad_norm": 1.119472861289978, + "learning_rate": 3.540799687451768e-05, + "loss": 0.7227, + "step": 12059 + }, + { + "epoch": 0.17754281623437415, + "grad_norm": 1.089117169380188, + "learning_rate": 3.533157994674485e-05, + "loss": 0.7132, + "step": 12090 + }, + { + "epoch": 0.17799805422471868, + "grad_norm": 1.0937649011611938, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.7285, + "step": 12121 + }, + { + "epoch": 0.17845329221506323, + "grad_norm": 1.0359808206558228, + "learning_rate": 3.517839718344311e-05, + "loss": 0.7178, + "step": 12152 + }, + { + "epoch": 0.17890853020540778, + "grad_norm": 1.1309690475463867, + "learning_rate": 3.510163307656086e-05, + "loss": 0.713, + "step": 12183 + }, + { + "epoch": 0.17936376819575234, + "grad_norm": 1.126597285270691, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.7139, + "step": 12214 + }, + { + "epoch": 0.1798190061860969, + "grad_norm": 1.29429292678833, + "learning_rate": 3.494776374368643e-05, + "loss": 0.7261, + "step": 12245 + }, + { + "epoch": 0.18027424417644144, + "grad_norm": 1.1156132221221924, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.7124, + "step": 12276 + }, + { + "epoch": 0.18072948216678597, + "grad_norm": 1.1548200845718384, + "learning_rate": 3.479344537543164e-05, + "loss": 0.7195, + "step": 12307 + }, + { + "epoch": 0.18118472015713052, + "grad_norm": 1.1318944692611694, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.7215, + "step": 12338 + }, + { + "epoch": 0.18163995814747508, + "grad_norm": 1.1952061653137207, + "learning_rate": 3.463868493762412e-05, + "loss": 0.7159, + "step": 12369 + }, + { + "epoch": 0.18209519613781963, + "grad_norm": 1.1703433990478516, + "learning_rate": 3.456114112492418e-05, + "loss": 0.7145, + "step": 12400 + }, + { + "epoch": 0.18255043412816419, + "grad_norm": 1.065319538116455, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.7108, + "step": 12431 + }, + { + "epoch": 0.18300567211850874, + "grad_norm": 1.278533697128296, + "learning_rate": 3.440573068727905e-05, + "loss": 0.7116, + "step": 12462 + }, + { + "epoch": 0.18346091010885326, + "grad_norm": 1.1642391681671143, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.716, + "step": 12493 + }, + { + "epoch": 0.18391614809919782, + "grad_norm": 1.2108798027038574, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.7194, + "step": 12524 + }, + { + "epoch": 0.18437138608954237, + "grad_norm": 1.085163950920105, + "learning_rate": 3.417182116258899e-05, + "loss": 0.7166, + "step": 12555 + }, + { + "epoch": 0.18482662407988693, + "grad_norm": 1.124064564704895, + "learning_rate": 3.409364314116074e-05, + "loss": 0.7218, + "step": 12586 + }, + { + "epoch": 0.18528186207023148, + "grad_norm": 1.1817457675933838, + "learning_rate": 3.401536249920559e-05, + "loss": 0.7161, + "step": 12617 + }, + { + "epoch": 0.18573710006057603, + "grad_norm": 1.1430065631866455, + "learning_rate": 3.393698012010998e-05, + "loss": 0.7223, + "step": 12648 + }, + { + "epoch": 0.18619233805092056, + "grad_norm": 1.151633620262146, + "learning_rate": 3.385849688840839e-05, + "loss": 0.7181, + "step": 12679 + }, + { + "epoch": 0.1866475760412651, + "grad_norm": 1.1847879886627197, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.7144, + "step": 12710 + }, + { + "epoch": 0.18710281403160967, + "grad_norm": 1.0997257232666016, + "learning_rate": 3.370123141100578e-05, + "loss": 0.7179, + "step": 12741 + }, + { + "epoch": 0.18755805202195422, + "grad_norm": 1.3207893371582031, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.7147, + "step": 12772 + }, + { + "epoch": 0.18801329001229877, + "grad_norm": 1.1414912939071655, + "learning_rate": 3.35435731658559e-05, + "loss": 0.7258, + "step": 12803 + }, + { + "epoch": 0.18846852800264333, + "grad_norm": 1.1708139181137085, + "learning_rate": 3.346459897862552e-05, + "loss": 0.7222, + "step": 12834 + }, + { + "epoch": 0.18892376599298788, + "grad_norm": 1.0472559928894043, + "learning_rate": 3.338552926954613e-05, + "loss": 0.7168, + "step": 12865 + }, + { + "epoch": 0.1893790039833324, + "grad_norm": 1.1117762327194214, + "learning_rate": 3.330636493090868e-05, + "loss": 0.7128, + "step": 12896 + }, + { + "epoch": 0.18983424197367696, + "grad_norm": 1.1351251602172852, + "learning_rate": 3.322710685607193e-05, + "loss": 0.72, + "step": 12927 + }, + { + "epoch": 0.1902894799640215, + "grad_norm": 1.1530694961547852, + "learning_rate": 3.314775593945251e-05, + "loss": 0.7133, + "step": 12958 + }, + { + "epoch": 0.19074471795436607, + "grad_norm": 1.1590766906738281, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.7236, + "step": 12989 + }, + { + "epoch": 0.19119995594471062, + "grad_norm": 1.0657463073730469, + "learning_rate": 3.298877916376047e-05, + "loss": 0.7159, + "step": 13020 + }, + { + "epoch": 0.19165519393505517, + "grad_norm": 1.090234637260437, + "learning_rate": 3.290915509871915e-05, + "loss": 0.7249, + "step": 13051 + }, + { + "epoch": 0.1921104319253997, + "grad_norm": 1.1001083850860596, + "learning_rate": 3.282944177993753e-05, + "loss": 0.721, + "step": 13082 + }, + { + "epoch": 0.19256566991574425, + "grad_norm": 1.0616897344589233, + "learning_rate": 3.274964010696957e-05, + "loss": 0.7145, + "step": 13113 + }, + { + "epoch": 0.1930209079060888, + "grad_norm": 1.05698823928833, + "learning_rate": 3.266975098036629e-05, + "loss": 0.7106, + "step": 13144 + }, + { + "epoch": 0.19347614589643336, + "grad_norm": 1.0747356414794922, + "learning_rate": 3.258977530166562e-05, + "loss": 0.7199, + "step": 13175 + }, + { + "epoch": 0.1939313838867779, + "grad_norm": 1.112629771232605, + "learning_rate": 3.250971397338227e-05, + "loss": 0.7088, + "step": 13206 + }, + { + "epoch": 0.19438662187712247, + "grad_norm": 1.13507878780365, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.7084, + "step": 13237 + }, + { + "epoch": 0.194841859867467, + "grad_norm": 1.137793779373169, + "learning_rate": 3.234933798294859e-05, + "loss": 0.7121, + "step": 13268 + }, + { + "epoch": 0.19529709785781155, + "grad_norm": 1.263027548789978, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.7117, + "step": 13299 + }, + { + "epoch": 0.1957523358481561, + "grad_norm": 1.1799430847167969, + "learning_rate": 3.218863024832985e-05, + "loss": 0.7144, + "step": 13330 + }, + { + "epoch": 0.19620757383850065, + "grad_norm": 1.040297508239746, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.7089, + "step": 13361 + }, + { + "epoch": 0.1966628118288452, + "grad_norm": 1.110816240310669, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.7166, + "step": 13392 + }, + { + "epoch": 0.19711804981918976, + "grad_norm": 1.0399518013000488, + "learning_rate": 3.194696249871729e-05, + "loss": 0.7081, + "step": 13423 + }, + { + "epoch": 0.1975732878095343, + "grad_norm": 1.1946238279342651, + "learning_rate": 3.186624857814164e-05, + "loss": 0.7112, + "step": 13454 + }, + { + "epoch": 0.19802852579987884, + "grad_norm": 1.1289162635803223, + "learning_rate": 3.178545717288401e-05, + "loss": 0.7079, + "step": 13485 + }, + { + "epoch": 0.1984837637902234, + "grad_norm": 1.17491614818573, + "learning_rate": 3.170458919466444e-05, + "loss": 0.7117, + "step": 13516 + }, + { + "epoch": 0.19893900178056795, + "grad_norm": 1.1498847007751465, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.7131, + "step": 13547 + }, + { + "epoch": 0.1993942397709125, + "grad_norm": 1.2516326904296875, + "learning_rate": 3.154262717052985e-05, + "loss": 0.7145, + "step": 13578 + }, + { + "epoch": 0.19984947776125705, + "grad_norm": 1.2112692594528198, + "learning_rate": 3.146153495233426e-05, + "loss": 0.7098, + "step": 13609 + }, + { + "epoch": 0.20030471575160158, + "grad_norm": 0.989539384841919, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.7182, + "step": 13640 + }, + { + "epoch": 0.20075995374194613, + "grad_norm": 1.0849697589874268, + "learning_rate": 3.129913267924946e-05, + "loss": 0.7153, + "step": 13671 + }, + { + "epoch": 0.2012151917322907, + "grad_norm": 1.1133675575256348, + "learning_rate": 3.121782445704782e-05, + "loss": 0.7104, + "step": 13702 + }, + { + "epoch": 0.20167042972263524, + "grad_norm": 1.1086009740829468, + "learning_rate": 3.11364460675423e-05, + "loss": 0.7125, + "step": 13733 + }, + { + "epoch": 0.2021256677129798, + "grad_norm": 1.0160905122756958, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.7055, + "step": 13764 + }, + { + "epoch": 0.20258090570332435, + "grad_norm": 1.0811957120895386, + "learning_rate": 3.097348246077728e-05, + "loss": 0.7105, + "step": 13795 + }, + { + "epoch": 0.20303614369366887, + "grad_norm": 1.0012198686599731, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.7134, + "step": 13826 + }, + { + "epoch": 0.20349138168401343, + "grad_norm": 1.2298318147659302, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.7054, + "step": 13857 + }, + { + "epoch": 0.20394661967435798, + "grad_norm": 1.1324481964111328, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.7023, + "step": 13888 + }, + { + "epoch": 0.20440185766470254, + "grad_norm": 1.1454377174377441, + "learning_rate": 3.064675369851637e-05, + "loss": 0.7124, + "step": 13919 + }, + { + "epoch": 0.2048570956550471, + "grad_norm": 1.0461289882659912, + "learning_rate": 3.056490989455289e-05, + "loss": 0.7159, + "step": 13950 + }, + { + "epoch": 0.20531233364539164, + "grad_norm": 1.0020164251327515, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.702, + "step": 13981 + }, + { + "epoch": 0.20576757163573617, + "grad_norm": 1.0952621698379517, + "learning_rate": 3.040103481317539e-05, + "loss": 0.7108, + "step": 14012 + }, + { + "epoch": 0.20622280962608072, + "grad_norm": 1.129813551902771, + "learning_rate": 3.03190053850694e-05, + "loss": 0.7043, + "step": 14043 + }, + { + "epoch": 0.20667804761642528, + "grad_norm": 1.012137770652771, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.7002, + "step": 14074 + }, + { + "epoch": 0.20713328560676983, + "grad_norm": 1.1774550676345825, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.6996, + "step": 14105 + }, + { + "epoch": 0.20758852359711438, + "grad_norm": 1.0878807306289673, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.7075, + "step": 14136 + }, + { + "epoch": 0.20804376158745894, + "grad_norm": 1.1599992513656616, + "learning_rate": 2.999029669712431e-05, + "loss": 0.7165, + "step": 14167 + }, + { + "epoch": 0.2084989995778035, + "grad_norm": 1.184403419494629, + "learning_rate": 2.990797641805408e-05, + "loss": 0.709, + "step": 14198 + }, + { + "epoch": 0.20895423756814802, + "grad_norm": 1.0857961177825928, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6986, + "step": 14229 + }, + { + "epoch": 0.20940947555849257, + "grad_norm": 1.0922378301620483, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.7085, + "step": 14260 + }, + { + "epoch": 0.20986471354883712, + "grad_norm": 1.0843058824539185, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.7114, + "step": 14291 + }, + { + "epoch": 0.21031995153918168, + "grad_norm": 1.2273341417312622, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.7092, + "step": 14322 + }, + { + "epoch": 0.21077518952952623, + "grad_norm": 1.1041830778121948, + "learning_rate": 2.949556283611942e-05, + "loss": 0.7086, + "step": 14353 + }, + { + "epoch": 0.21123042751987078, + "grad_norm": 1.1019948720932007, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.7095, + "step": 14384 + }, + { + "epoch": 0.2116856655102153, + "grad_norm": 1.1595929861068726, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.7083, + "step": 14415 + }, + { + "epoch": 0.21214090350055986, + "grad_norm": 1.252021312713623, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.7045, + "step": 14446 + }, + { + "epoch": 0.21259614149090442, + "grad_norm": 1.1469565629959106, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6995, + "step": 14477 + }, + { + "epoch": 0.21305137948124897, + "grad_norm": 1.1796189546585083, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.7101, + "step": 14508 + }, + { + "epoch": 0.21350661747159352, + "grad_norm": 1.183271884918213, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.7074, + "step": 14539 + }, + { + "epoch": 0.21396185546193808, + "grad_norm": 1.2489229440689087, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.7126, + "step": 14570 + }, + { + "epoch": 0.2144170934522826, + "grad_norm": 1.059584379196167, + "learning_rate": 2.883311164593017e-05, + "loss": 0.7045, + "step": 14601 + }, + { + "epoch": 0.21487233144262716, + "grad_norm": 1.1565430164337158, + "learning_rate": 2.875010077160754e-05, + "loss": 0.7075, + "step": 14632 + }, + { + "epoch": 0.2153275694329717, + "grad_norm": 1.0368664264678955, + "learning_rate": 2.866704757790741e-05, + "loss": 0.7065, + "step": 14663 + }, + { + "epoch": 0.21578280742331626, + "grad_norm": 1.1395366191864014, + "learning_rate": 2.858395300207376e-05, + "loss": 0.7029, + "step": 14694 + }, + { + "epoch": 0.21623804541366082, + "grad_norm": 1.077816128730774, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.7078, + "step": 14725 + }, + { + "epoch": 0.21669328340400537, + "grad_norm": 1.2020843029022217, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.7011, + "step": 14756 + }, + { + "epoch": 0.2171485213943499, + "grad_norm": 1.0293958187103271, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.7038, + "step": 14787 + }, + { + "epoch": 0.21760375938469445, + "grad_norm": 1.2163504362106323, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.7029, + "step": 14818 + }, + { + "epoch": 0.218058997375039, + "grad_norm": 1.0222349166870117, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.7084, + "step": 14849 + }, + { + "epoch": 0.21851423536538356, + "grad_norm": 1.0532034635543823, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.7162, + "step": 14880 + }, + { + "epoch": 0.2189694733557281, + "grad_norm": 0.9981489181518555, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.7088, + "step": 14911 + }, + { + "epoch": 0.21942471134607267, + "grad_norm": 1.0804964303970337, + "learning_rate": 2.791781925709473e-05, + "loss": 0.7035, + "step": 14942 + }, + { + "epoch": 0.2198799493364172, + "grad_norm": 1.1326287984848022, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.708, + "step": 14973 + }, + { + "epoch": 0.22033518732676174, + "grad_norm": 1.1164413690567017, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.7047, + "step": 15004 + }, + { + "epoch": 0.2207904253171063, + "grad_norm": 1.1455166339874268, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.708, + "step": 15035 + }, + { + "epoch": 0.22124566330745085, + "grad_norm": 1.1329716444015503, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6923, + "step": 15066 + }, + { + "epoch": 0.2217009012977954, + "grad_norm": 1.3479692935943604, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.7119, + "step": 15097 + }, + { + "epoch": 0.22215613928813996, + "grad_norm": 1.1160681247711182, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.7031, + "step": 15128 + }, + { + "epoch": 0.22261137727848448, + "grad_norm": 1.1188467741012573, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.7029, + "step": 15159 + }, + { + "epoch": 0.22306661526882904, + "grad_norm": 0.9910608530044556, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6996, + "step": 15190 + }, + { + "epoch": 0.2235218532591736, + "grad_norm": 1.100176453590393, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.7027, + "step": 15221 + }, + { + "epoch": 0.22397709124951815, + "grad_norm": 1.1848573684692383, + "learning_rate": 2.708224532974953e-05, + "loss": 0.7111, + "step": 15252 + }, + { + "epoch": 0.2244323292398627, + "grad_norm": 1.1293883323669434, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.7045, + "step": 15283 + }, + { + "epoch": 0.22488756723020725, + "grad_norm": 1.1224740743637085, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.7113, + "step": 15314 + }, + { + "epoch": 0.2253428052205518, + "grad_norm": 1.2419655323028564, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.7062, + "step": 15345 + }, + { + "epoch": 0.22579804321089633, + "grad_norm": 1.1906564235687256, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.7112, + "step": 15376 + }, + { + "epoch": 0.22625328120124089, + "grad_norm": 1.0610102415084839, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.7052, + "step": 15407 + }, + { + "epoch": 0.22670851919158544, + "grad_norm": 1.3254245519638062, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.6975, + "step": 15438 + }, + { + "epoch": 0.22716375718193, + "grad_norm": 1.1128469705581665, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.6993, + "step": 15469 + }, + { + "epoch": 0.22761899517227455, + "grad_norm": 1.0977287292480469, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.7001, + "step": 15500 + }, + { + "epoch": 0.2280742331626191, + "grad_norm": 0.9699016213417053, + "learning_rate": 2.632819298478939e-05, + "loss": 0.7082, + "step": 15531 + }, + { + "epoch": 0.22852947115296363, + "grad_norm": 1.1493170261383057, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.7019, + "step": 15562 + }, + { + "epoch": 0.22898470914330818, + "grad_norm": 1.1549670696258545, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.7087, + "step": 15593 + }, + { + "epoch": 0.22943994713365273, + "grad_norm": 1.2285927534103394, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.695, + "step": 15624 + }, + { + "epoch": 0.2298951851239973, + "grad_norm": 1.0625406503677368, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.7072, + "step": 15655 + }, + { + "epoch": 0.23035042311434184, + "grad_norm": 1.2031610012054443, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.6952, + "step": 15686 + }, + { + "epoch": 0.2308056611046864, + "grad_norm": 1.0590460300445557, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.6981, + "step": 15717 + }, + { + "epoch": 0.23126089909503092, + "grad_norm": 1.0085610151290894, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.7006, + "step": 15748 + }, + { + "epoch": 0.23171613708537547, + "grad_norm": 1.1644418239593506, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.7023, + "step": 15779 + }, + { + "epoch": 0.23217137507572003, + "grad_norm": 1.0243310928344727, + "learning_rate": 2.557292666450159e-05, + "loss": 0.7106, + "step": 15810 + }, + { + "epoch": 0.23262661306606458, + "grad_norm": 1.0970982313156128, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.7018, + "step": 15841 + }, + { + "epoch": 0.23308185105640913, + "grad_norm": 1.0774227380752563, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.7058, + "step": 15872 + }, + { + "epoch": 0.2335370890467537, + "grad_norm": 1.2018071413040161, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.7072, + "step": 15903 + }, + { + "epoch": 0.2339923270370982, + "grad_norm": 1.1195406913757324, + "learning_rate": 2.52370435981567e-05, + "loss": 0.6971, + "step": 15934 + }, + { + "epoch": 0.23444756502744277, + "grad_norm": 1.0707147121429443, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.7005, + "step": 15965 + }, + { + "epoch": 0.23490280301778732, + "grad_norm": 1.0488064289093018, + "learning_rate": 2.506908222441045e-05, + "loss": 0.6994, + "step": 15996 + }, + { + "epoch": 0.23535804100813187, + "grad_norm": 1.0699859857559204, + "learning_rate": 2.498509989430187e-05, + "loss": 0.6931, + "step": 16027 + }, + { + "epoch": 0.23581327899847643, + "grad_norm": 1.0461689233779907, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.7022, + "step": 16058 + }, + { + "epoch": 0.23626851698882098, + "grad_norm": 1.096604824066162, + "learning_rate": 2.481713668624899e-05, + "loss": 0.7043, + "step": 16089 + }, + { + "epoch": 0.2367237549791655, + "grad_norm": 1.0687739849090576, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.7043, + "step": 16120 + }, + { + "epoch": 0.23717899296951006, + "grad_norm": 1.1307755708694458, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.7059, + "step": 16151 + }, + { + "epoch": 0.23763423095985461, + "grad_norm": 1.0404301881790161, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.6981, + "step": 16182 + }, + { + "epoch": 0.23808946895019917, + "grad_norm": 1.0836886167526245, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.7145, + "step": 16213 + }, + { + "epoch": 0.23854470694054372, + "grad_norm": 1.0622589588165283, + "learning_rate": 2.439728136286796e-05, + "loss": 0.7069, + "step": 16244 + }, + { + "epoch": 0.23899994493088828, + "grad_norm": 1.1610299348831177, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.7022, + "step": 16275 + }, + { + "epoch": 0.2394551829212328, + "grad_norm": 1.004273772239685, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.6905, + "step": 16306 + }, + { + "epoch": 0.23991042091157735, + "grad_norm": 1.0684071779251099, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.6977, + "step": 16337 + }, + { + "epoch": 0.2403656589019219, + "grad_norm": 0.9177312850952148, + "learning_rate": 2.406151384602059e-05, + "loss": 0.6979, + "step": 16368 + }, + { + "epoch": 0.24082089689226646, + "grad_norm": 1.0734107494354248, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.6987, + "step": 16399 + }, + { + "epoch": 0.24127613488261102, + "grad_norm": 1.1414164304733276, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.6927, + "step": 16430 + }, + { + "epoch": 0.24173137287295557, + "grad_norm": 1.1547383069992065, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.7053, + "step": 16461 + }, + { + "epoch": 0.2421866108633001, + "grad_norm": 1.0909677743911743, + "learning_rate": 2.372591577780202e-05, + "loss": 0.6987, + "step": 16492 + }, + { + "epoch": 0.24264184885364465, + "grad_norm": 1.0706005096435547, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.7014, + "step": 16523 + }, + { + "epoch": 0.2430970868439892, + "grad_norm": 1.0389344692230225, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.695, + "step": 16554 + }, + { + "epoch": 0.24355232483433376, + "grad_norm": 1.0836538076400757, + "learning_rate": 2.347436487983929e-05, + "loss": 0.7004, + "step": 16585 + }, + { + "epoch": 0.2440075628246783, + "grad_norm": 1.0748459100723267, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.7018, + "step": 16616 + }, + { + "epoch": 0.24446280081502286, + "grad_norm": 1.097935438156128, + "learning_rate": 2.330674878704035e-05, + "loss": 0.706, + "step": 16647 + }, + { + "epoch": 0.24491803880536742, + "grad_norm": 1.1082520484924316, + "learning_rate": 2.322296892997561e-05, + "loss": 0.7012, + "step": 16678 + }, + { + "epoch": 0.24537327679571194, + "grad_norm": 1.0682934522628784, + "learning_rate": 2.313920912646497e-05, + "loss": 0.701, + "step": 16709 + }, + { + "epoch": 0.2458285147860565, + "grad_norm": 1.1116893291473389, + "learning_rate": 2.305547032172643e-05, + "loss": 0.7038, + "step": 16740 + }, + { + "epoch": 0.24628375277640105, + "grad_norm": 1.0376949310302734, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.6998, + "step": 16771 + }, + { + "epoch": 0.2467389907667456, + "grad_norm": 1.0389093160629272, + "learning_rate": 2.288805948824212e-05, + "loss": 0.7043, + "step": 16802 + }, + { + "epoch": 0.24719422875709016, + "grad_norm": 1.0645474195480347, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.6947, + "step": 16833 + }, + { + "epoch": 0.2476494667474347, + "grad_norm": 1.0893995761871338, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.6981, + "step": 16864 + }, + { + "epoch": 0.24810470473777924, + "grad_norm": 1.022275447845459, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.7081, + "step": 16895 + }, + { + "epoch": 0.2485599427281238, + "grad_norm": 1.1055867671966553, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.6942, + "step": 16926 + }, + { + "epoch": 0.24901518071846834, + "grad_norm": 1.0815192461013794, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.6951, + "step": 16957 + }, + { + "epoch": 0.2494704187088129, + "grad_norm": 1.0612388849258423, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.6991, + "step": 16988 + }, + { + "epoch": 0.24992565669915745, + "grad_norm": 1.0434961318969727, + "learning_rate": 2.230292185905114e-05, + "loss": 0.6904, + "step": 17019 + }, + { + "epoch": 0.250380894689502, + "grad_norm": 1.0427175760269165, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.6979, + "step": 17050 + }, + { + "epoch": 0.25083613267984656, + "grad_norm": 1.0715687274932861, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.7034, + "step": 17081 + }, + { + "epoch": 0.2512913706701911, + "grad_norm": 1.0116679668426514, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.6996, + "step": 17112 + }, + { + "epoch": 0.25174660866053566, + "grad_norm": 1.2103781700134277, + "learning_rate": 2.196920634473666e-05, + "loss": 0.7026, + "step": 17143 + }, + { + "epoch": 0.2522018466508802, + "grad_norm": 1.0434819459915161, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.6979, + "step": 17174 + }, + { + "epoch": 0.2526570846412247, + "grad_norm": 1.2911967039108276, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.6866, + "step": 17205 + }, + { + "epoch": 0.2531123226315693, + "grad_norm": 1.1720303297042847, + "learning_rate": 2.171927553439363e-05, + "loss": 0.6868, + "step": 17236 + }, + { + "epoch": 0.2535675606219138, + "grad_norm": 1.0302678346633911, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.691, + "step": 17267 + }, + { + "epoch": 0.2540227986122584, + "grad_norm": 1.0190601348876953, + "learning_rate": 2.155283853988844e-05, + "loss": 0.6964, + "step": 17298 + }, + { + "epoch": 0.25447803660260293, + "grad_norm": 1.109703540802002, + "learning_rate": 2.146967792431106e-05, + "loss": 0.693, + "step": 17329 + }, + { + "epoch": 0.25493327459294746, + "grad_norm": 1.160040020942688, + "learning_rate": 2.138655714793849e-05, + "loss": 0.6943, + "step": 17360 + }, + { + "epoch": 0.25538851258329204, + "grad_norm": 1.083268404006958, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.7024, + "step": 17391 + }, + { + "epoch": 0.25584375057363656, + "grad_norm": 1.0631040334701538, + "learning_rate": 2.122043886437421e-05, + "loss": 0.6896, + "step": 17422 + }, + { + "epoch": 0.25629898856398114, + "grad_norm": 1.2141170501708984, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.7005, + "step": 17453 + }, + { + "epoch": 0.25675422655432567, + "grad_norm": 1.082511067390442, + "learning_rate": 2.105449118766347e-05, + "loss": 0.6906, + "step": 17484 + }, + { + "epoch": 0.25720946454467025, + "grad_norm": 0.9919353127479553, + "learning_rate": 2.097158366805287e-05, + "loss": 0.7017, + "step": 17515 + }, + { + "epoch": 0.2576647025350148, + "grad_norm": 1.0450084209442139, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.696, + "step": 17546 + }, + { + "epoch": 0.2581199405253593, + "grad_norm": 1.0460536479949951, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.6947, + "step": 17577 + }, + { + "epoch": 0.2585751785157039, + "grad_norm": 1.0822510719299316, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.7039, + "step": 17608 + }, + { + "epoch": 0.2590304165060484, + "grad_norm": 1.0411216020584106, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.6962, + "step": 17639 + }, + { + "epoch": 0.259485654496393, + "grad_norm": 1.0115315914154053, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.6954, + "step": 17670 + }, + { + "epoch": 0.2599408924867375, + "grad_norm": 1.0552514791488647, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.6881, + "step": 17701 + }, + { + "epoch": 0.26039613047708204, + "grad_norm": 0.9966985583305359, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.7012, + "step": 17732 + }, + { + "epoch": 0.2608513684674266, + "grad_norm": 1.113692045211792, + "learning_rate": 2.031003855589343e-05, + "loss": 0.703, + "step": 17763 + }, + { + "epoch": 0.26130660645777115, + "grad_norm": 1.0169728994369507, + "learning_rate": 2.022757379528727e-05, + "loss": 0.7008, + "step": 17794 + }, + { + "epoch": 0.26176184444811573, + "grad_norm": 1.1313414573669434, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.6956, + "step": 17825 + }, + { + "epoch": 0.26221708243846026, + "grad_norm": 0.9456464052200317, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.694, + "step": 17856 + }, + { + "epoch": 0.26267232042880484, + "grad_norm": 1.0825542211532593, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.6915, + "step": 17887 + }, + { + "epoch": 0.26312755841914937, + "grad_norm": 1.059581995010376, + "learning_rate": 1.989826261153015e-05, + "loss": 0.6922, + "step": 17918 + }, + { + "epoch": 0.2635827964094939, + "grad_norm": 1.0134432315826416, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.6952, + "step": 17949 + }, + { + "epoch": 0.2640380343998385, + "grad_norm": 0.9800439476966858, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.7036, + "step": 17980 + }, + { + "epoch": 0.264493272390183, + "grad_norm": 1.128818392753601, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.6916, + "step": 18011 + }, + { + "epoch": 0.2649485103805276, + "grad_norm": 1.0002161264419556, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.6886, + "step": 18042 + }, + { + "epoch": 0.2654037483708721, + "grad_norm": 1.1037601232528687, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.6954, + "step": 18073 + }, + { + "epoch": 0.2658589863612167, + "grad_norm": 1.0204657316207886, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.6976, + "step": 18104 + }, + { + "epoch": 0.2663142243515612, + "grad_norm": 1.0254517793655396, + "learning_rate": 1.932422022132275e-05, + "loss": 0.697, + "step": 18135 + }, + { + "epoch": 0.26676946234190574, + "grad_norm": 1.0792242288589478, + "learning_rate": 1.924246297341414e-05, + "loss": 0.6932, + "step": 18166 + }, + { + "epoch": 0.2672247003322503, + "grad_norm": 1.2440094947814941, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.6925, + "step": 18197 + }, + { + "epoch": 0.26767993832259485, + "grad_norm": 1.0181853771209717, + "learning_rate": 1.907914431855156e-05, + "loss": 0.6854, + "step": 18228 + }, + { + "epoch": 0.2681351763129394, + "grad_norm": 0.982681930065155, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.6892, + "step": 18259 + }, + { + "epoch": 0.26859041430328395, + "grad_norm": 1.1587820053100586, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.6955, + "step": 18290 + }, + { + "epoch": 0.2690456522936285, + "grad_norm": 1.0297470092773438, + "learning_rate": 1.883466975572098e-05, + "loss": 0.6921, + "step": 18321 + }, + { + "epoch": 0.26950089028397306, + "grad_norm": 1.0646672248840332, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.6966, + "step": 18352 + }, + { + "epoch": 0.2699561282743176, + "grad_norm": 1.0070273876190186, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.7005, + "step": 18383 + }, + { + "epoch": 0.27041136626466217, + "grad_norm": 0.9793278574943542, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.6894, + "step": 18414 + }, + { + "epoch": 0.2708666042550067, + "grad_norm": 1.0349115133285522, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.6906, + "step": 18445 + }, + { + "epoch": 0.2713218422453513, + "grad_norm": 1.0271046161651611, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.6916, + "step": 18476 + }, + { + "epoch": 0.2717770802356958, + "grad_norm": 0.9766640663146973, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.692, + "step": 18507 + }, + { + "epoch": 0.2722323182260403, + "grad_norm": 1.0498918294906616, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.687, + "step": 18538 + }, + { + "epoch": 0.2726875562163849, + "grad_norm": 0.970116138458252, + "learning_rate": 1.818586609711774e-05, + "loss": 0.6923, + "step": 18569 + }, + { + "epoch": 0.27314279420672943, + "grad_norm": 1.1822494268417358, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.6899, + "step": 18600 + }, + { + "epoch": 0.273598032197074, + "grad_norm": 1.0538249015808105, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.6912, + "step": 18631 + }, + { + "epoch": 0.27405327018741854, + "grad_norm": 1.123678207397461, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.6918, + "step": 18662 + }, + { + "epoch": 0.27450850817776307, + "grad_norm": 1.0302077531814575, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.6872, + "step": 18693 + }, + { + "epoch": 0.27496374616810765, + "grad_norm": 1.0867012739181519, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.7006, + "step": 18724 + }, + { + "epoch": 0.2754189841584522, + "grad_norm": 1.0516695976257324, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.6969, + "step": 18755 + }, + { + "epoch": 0.27587422214879675, + "grad_norm": 1.083567500114441, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.6936, + "step": 18786 + }, + { + "epoch": 0.2763294601391413, + "grad_norm": 1.0399643182754517, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.6887, + "step": 18817 + }, + { + "epoch": 0.27678469812948586, + "grad_norm": 1.1514192819595337, + "learning_rate": 1.746186742108387e-05, + "loss": 0.6882, + "step": 18848 + }, + { + "epoch": 0.2772399361198304, + "grad_norm": 1.1234108209609985, + "learning_rate": 1.73818363812215e-05, + "loss": 0.6909, + "step": 18879 + }, + { + "epoch": 0.2776951741101749, + "grad_norm": 1.0432260036468506, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.6826, + "step": 18910 + }, + { + "epoch": 0.2781504121005195, + "grad_norm": 1.2708081007003784, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.694, + "step": 18941 + }, + { + "epoch": 0.278605650090864, + "grad_norm": 0.9991064667701721, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.7001, + "step": 18972 + }, + { + "epoch": 0.2790608880812086, + "grad_norm": 1.103553295135498, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.6974, + "step": 19003 + }, + { + "epoch": 0.27951612607155313, + "grad_norm": 1.0002790689468384, + "learning_rate": 1.698298875964369e-05, + "loss": 0.6951, + "step": 19034 + }, + { + "epoch": 0.27997136406189765, + "grad_norm": 1.0627328157424927, + "learning_rate": 1.690348705220684e-05, + "loss": 0.6914, + "step": 19065 + }, + { + "epoch": 0.28042660205224224, + "grad_norm": 1.152733325958252, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.6909, + "step": 19096 + }, + { + "epoch": 0.28088184004258676, + "grad_norm": 1.1142559051513672, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.6878, + "step": 19127 + }, + { + "epoch": 0.28133707803293134, + "grad_norm": 1.022026538848877, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.6876, + "step": 19158 + }, + { + "epoch": 0.28179231602327587, + "grad_norm": 1.117065668106079, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.6878, + "step": 19189 + }, + { + "epoch": 0.28224755401362045, + "grad_norm": 0.9499729871749878, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.6888, + "step": 19220 + }, + { + "epoch": 0.282702792003965, + "grad_norm": 1.111111044883728, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.6898, + "step": 19251 + }, + { + "epoch": 0.2831580299943095, + "grad_norm": 1.1620928049087524, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.6948, + "step": 19282 + }, + { + "epoch": 0.2836132679846541, + "grad_norm": 1.1431219577789307, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.6929, + "step": 19313 + }, + { + "epoch": 0.2840685059749986, + "grad_norm": 1.1274683475494385, + "learning_rate": 1.619219056243676e-05, + "loss": 0.6823, + "step": 19344 + }, + { + "epoch": 0.2845237439653432, + "grad_norm": 1.1499154567718506, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.6838, + "step": 19375 + }, + { + "epoch": 0.2849789819556877, + "grad_norm": 1.0493180751800537, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.6867, + "step": 19406 + }, + { + "epoch": 0.2854342199460323, + "grad_norm": 0.9728123545646667, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.6889, + "step": 19437 + }, + { + "epoch": 0.2858894579363768, + "grad_norm": 1.0137308835983276, + "learning_rate": 1.587860447859413e-05, + "loss": 0.6892, + "step": 19468 + }, + { + "epoch": 0.28634469592672135, + "grad_norm": 1.0865050554275513, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.6841, + "step": 19499 + }, + { + "epoch": 0.28679993391706593, + "grad_norm": 1.0522550344467163, + "learning_rate": 1.572242550298298e-05, + "loss": 0.6905, + "step": 19530 + }, + { + "epoch": 0.28725517190741046, + "grad_norm": 1.1563197374343872, + "learning_rate": 1.56444926191065e-05, + "loss": 0.6811, + "step": 19561 + }, + { + "epoch": 0.28771040989775504, + "grad_norm": 0.962688684463501, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.6898, + "step": 19592 + }, + { + "epoch": 0.28816564788809956, + "grad_norm": 1.0998531579971313, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.6909, + "step": 19623 + }, + { + "epoch": 0.2886208858784441, + "grad_norm": 1.1609821319580078, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.6844, + "step": 19654 + }, + { + "epoch": 0.28907612386878867, + "grad_norm": 0.9745819568634033, + "learning_rate": 1.533382561658241e-05, + "loss": 0.6933, + "step": 19685 + }, + { + "epoch": 0.2895313618591332, + "grad_norm": 1.085925817489624, + "learning_rate": 1.525642938149541e-05, + "loss": 0.6894, + "step": 19716 + }, + { + "epoch": 0.2899865998494778, + "grad_norm": 1.0314606428146362, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.6965, + "step": 19747 + }, + { + "epoch": 0.2904418378398223, + "grad_norm": 1.0771900415420532, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.6904, + "step": 19778 + }, + { + "epoch": 0.2908970758301669, + "grad_norm": 0.9729062914848328, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.6886, + "step": 19809 + }, + { + "epoch": 0.2913523138205114, + "grad_norm": 1.0824676752090454, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.6804, + "step": 19840 + }, + { + "epoch": 0.29180755181085594, + "grad_norm": 1.0260144472122192, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.6905, + "step": 19871 + }, + { + "epoch": 0.2922627898012005, + "grad_norm": 0.9324101209640503, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.6877, + "step": 19902 + }, + { + "epoch": 0.29271802779154504, + "grad_norm": 1.0553687810897827, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.6928, + "step": 19933 + }, + { + "epoch": 0.2931732657818896, + "grad_norm": 1.129400610923767, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.6905, + "step": 19964 + }, + { + "epoch": 0.29362850377223415, + "grad_norm": 1.064041018486023, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.6936, + "step": 19995 + }, + { + "epoch": 0.2940837417625787, + "grad_norm": 1.116929292678833, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.6818, + "step": 20026 + }, + { + "epoch": 0.29453897975292326, + "grad_norm": 1.0334928035736084, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.6887, + "step": 20057 + }, + { + "epoch": 0.2949942177432678, + "grad_norm": 1.0690734386444092, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.6885, + "step": 20088 + }, + { + "epoch": 0.29544945573361237, + "grad_norm": 1.1211203336715698, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.6919, + "step": 20119 + }, + { + "epoch": 0.2959046937239569, + "grad_norm": 0.9984875917434692, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.6892, + "step": 20150 + }, + { + "epoch": 0.29635993171430147, + "grad_norm": 1.0159475803375244, + "learning_rate": 1.410916653306954e-05, + "loss": 0.682, + "step": 20181 + }, + { + "epoch": 0.296815169704646, + "grad_norm": 0.9778633117675781, + "learning_rate": 1.403363351752639e-05, + "loss": 0.6808, + "step": 20212 + }, + { + "epoch": 0.2972704076949905, + "grad_norm": 1.1207058429718018, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.6852, + "step": 20243 + }, + { + "epoch": 0.2977256456853351, + "grad_norm": 1.0286227464675903, + "learning_rate": 1.388293959928911e-05, + "loss": 0.6886, + "step": 20274 + }, + { + "epoch": 0.29818088367567963, + "grad_norm": 1.0112954378128052, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.6878, + "step": 20305 + }, + { + "epoch": 0.2986361216660242, + "grad_norm": 1.0683724880218506, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.6889, + "step": 20336 + }, + { + "epoch": 0.29909135965636874, + "grad_norm": 1.0744072198867798, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.6791, + "step": 20367 + }, + { + "epoch": 0.2995465976467133, + "grad_norm": 1.0279752016067505, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.684, + "step": 20398 + }, + { + "epoch": 0.30000183563705785, + "grad_norm": 0.9995334148406982, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.6906, + "step": 20429 + }, + { + "epoch": 0.30045707362740237, + "grad_norm": 1.351607322692871, + "learning_rate": 1.343389583978327e-05, + "loss": 0.6964, + "step": 20460 + }, + { + "epoch": 0.30091231161774695, + "grad_norm": 1.0838359594345093, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.6784, + "step": 20491 + }, + { + "epoch": 0.3013675496080915, + "grad_norm": 1.0536307096481323, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.6872, + "step": 20522 + }, + { + "epoch": 0.30182278759843606, + "grad_norm": 0.9636529088020325, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.6914, + "step": 20553 + }, + { + "epoch": 0.3022780255887806, + "grad_norm": 1.1852017641067505, + "learning_rate": 1.313713250302451e-05, + "loss": 0.6821, + "step": 20584 + }, + { + "epoch": 0.3027332635791251, + "grad_norm": 1.072434425354004, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.695, + "step": 20615 + }, + { + "epoch": 0.3031885015694697, + "grad_norm": 1.2345269918441772, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.6824, + "step": 20646 + }, + { + "epoch": 0.3036437395598142, + "grad_norm": 1.0516636371612549, + "learning_rate": 1.291596270869846e-05, + "loss": 0.6854, + "step": 20677 + }, + { + "epoch": 0.3040989775501588, + "grad_norm": 1.0413544178009033, + "learning_rate": 1.284251106960927e-05, + "loss": 0.6895, + "step": 20708 + }, + { + "epoch": 0.3045542155405033, + "grad_norm": 1.158065676689148, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.6861, + "step": 20739 + }, + { + "epoch": 0.3050094535308479, + "grad_norm": 1.0109269618988037, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.6898, + "step": 20770 + }, + { + "epoch": 0.30546469152119243, + "grad_norm": 0.9886858463287354, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.6874, + "step": 20801 + }, + { + "epoch": 0.30591992951153696, + "grad_norm": 1.0234347581863403, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.6823, + "step": 20832 + }, + { + "epoch": 0.30637516750188154, + "grad_norm": 1.028950810432434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.6884, + "step": 20863 + }, + { + "epoch": 0.30683040549222607, + "grad_norm": 1.1941654682159424, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.7019, + "step": 20894 + }, + { + "epoch": 0.30728564348257065, + "grad_norm": 1.0201176404953003, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.6846, + "step": 20925 + }, + { + "epoch": 0.3077408814729152, + "grad_norm": 0.9765841364860535, + "learning_rate": 1.225990629829241e-05, + "loss": 0.6881, + "step": 20956 + }, + { + "epoch": 0.3081961194632597, + "grad_norm": 1.0036793947219849, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.6849, + "step": 20987 + }, + { + "epoch": 0.3086513574536043, + "grad_norm": 1.1151163578033447, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.6825, + "step": 21018 + }, + { + "epoch": 0.3091065954439488, + "grad_norm": 1.0734307765960693, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.6902, + "step": 21049 + }, + { + "epoch": 0.3095618334342934, + "grad_norm": 0.9811964631080627, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.6883, + "step": 21080 + }, + { + "epoch": 0.3100170714246379, + "grad_norm": 1.0949833393096924, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.6873, + "step": 21111 + }, + { + "epoch": 0.3104723094149825, + "grad_norm": 1.0459587574005127, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.6853, + "step": 21142 + }, + { + "epoch": 0.310927547405327, + "grad_norm": 1.1628592014312744, + "learning_rate": 1.175766039353062e-05, + "loss": 0.6837, + "step": 21173 + }, + { + "epoch": 0.31138278539567155, + "grad_norm": 0.9916526079177856, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.6838, + "step": 21204 + }, + { + "epoch": 0.3118380233860161, + "grad_norm": 0.9945309162139893, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.6811, + "step": 21235 + }, + { + "epoch": 0.31229326137636065, + "grad_norm": 1.0234261751174927, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.6833, + "step": 21266 + }, + { + "epoch": 0.31274849936670523, + "grad_norm": 0.999071478843689, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.6858, + "step": 21297 + }, + { + "epoch": 0.31320373735704976, + "grad_norm": 1.0478752851486206, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.6918, + "step": 21328 + }, + { + "epoch": 0.3136589753473943, + "grad_norm": 1.083009958267212, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.6758, + "step": 21359 + }, + { + "epoch": 0.31411421333773887, + "grad_norm": 0.9705089926719666, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.6784, + "step": 21390 + }, + { + "epoch": 0.3145694513280834, + "grad_norm": 0.9727345108985901, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.6902, + "step": 21421 + }, + { + "epoch": 0.315024689318428, + "grad_norm": 1.1719439029693604, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.6802, + "step": 21452 + }, + { + "epoch": 0.3154799273087725, + "grad_norm": 1.061924695968628, + "learning_rate": 1.105293586433634e-05, + "loss": 0.6829, + "step": 21483 + }, + { + "epoch": 0.3159351652991171, + "grad_norm": 0.965242326259613, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.6795, + "step": 21514 + }, + { + "epoch": 0.3163904032894616, + "grad_norm": 1.0916402339935303, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.689, + "step": 21545 + }, + { + "epoch": 0.31684564127980613, + "grad_norm": 1.088815450668335, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.6845, + "step": 21576 + }, + { + "epoch": 0.3173008792701507, + "grad_norm": 1.052106499671936, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.68, + "step": 21607 + }, + { + "epoch": 0.31775611726049524, + "grad_norm": 0.9820737242698669, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.6872, + "step": 21638 + }, + { + "epoch": 0.3182113552508398, + "grad_norm": 1.014233946800232, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.6891, + "step": 21669 + }, + { + "epoch": 0.31866659324118435, + "grad_norm": 1.098426103591919, + "learning_rate": 1.05689459597817e-05, + "loss": 0.6878, + "step": 21700 + }, + { + "epoch": 0.31912183123152893, + "grad_norm": 1.0153820514678955, + "learning_rate": 1.050044973809246e-05, + "loss": 0.6793, + "step": 21731 + }, + { + "epoch": 0.31957706922187346, + "grad_norm": 1.07473886013031, + "learning_rate": 1.043211714185722e-05, + "loss": 0.6892, + "step": 21762 + }, + { + "epoch": 0.320032307212218, + "grad_norm": 1.108799934387207, + "learning_rate": 1.036394894220003e-05, + "loss": 0.6819, + "step": 21793 + }, + { + "epoch": 0.32048754520256256, + "grad_norm": 1.105481743812561, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.6871, + "step": 21824 + }, + { + "epoch": 0.3209427831929071, + "grad_norm": 1.106384038925171, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.6789, + "step": 21855 + }, + { + "epoch": 0.32139802118325167, + "grad_norm": 1.0353236198425293, + "learning_rate": 1.01604384060574e-05, + "loss": 0.6822, + "step": 21886 + }, + { + "epoch": 0.3218532591735962, + "grad_norm": 1.086665153503418, + "learning_rate": 1.009293546671907e-05, + "loss": 0.6804, + "step": 21917 + }, + { + "epoch": 0.3223084971639407, + "grad_norm": 0.982736349105835, + "learning_rate": 1.002560075157791e-05, + "loss": 0.6838, + "step": 21948 + }, + { + "epoch": 0.3227637351542853, + "grad_norm": 1.1834845542907715, + "learning_rate": 9.958435020496995e-06, + "loss": 0.6881, + "step": 21979 + }, + { + "epoch": 0.32321897314462983, + "grad_norm": 1.0267136096954346, + "learning_rate": 9.89143903143249e-06, + "loss": 0.6814, + "step": 22010 + }, + { + "epoch": 0.3236742111349744, + "grad_norm": 1.0347238779067993, + "learning_rate": 9.824613540425038e-06, + "loss": 0.6806, + "step": 22041 + }, + { + "epoch": 0.32412944912531894, + "grad_norm": 1.0527032613754272, + "learning_rate": 9.757959301591197e-06, + "loss": 0.6791, + "step": 22072 + }, + { + "epoch": 0.3245846871156635, + "grad_norm": 1.0493974685668945, + "learning_rate": 9.691477067115017e-06, + "loss": 0.69, + "step": 22103 + }, + { + "epoch": 0.32503992510600804, + "grad_norm": 1.0073857307434082, + "learning_rate": 9.625167587239467e-06, + "loss": 0.6781, + "step": 22134 + }, + { + "epoch": 0.32549516309635257, + "grad_norm": 0.9913318157196045, + "learning_rate": 9.559031610258007e-06, + "loss": 0.6825, + "step": 22165 + }, + { + "epoch": 0.32595040108669715, + "grad_norm": 1.0183086395263672, + "learning_rate": 9.493069882506164e-06, + "loss": 0.6805, + "step": 22196 + }, + { + "epoch": 0.3264056390770417, + "grad_norm": 1.0087958574295044, + "learning_rate": 9.427283148353056e-06, + "loss": 0.6775, + "step": 22227 + }, + { + "epoch": 0.32686087706738626, + "grad_norm": 1.1444545984268188, + "learning_rate": 9.361672150193052e-06, + "loss": 0.684, + "step": 22258 + }, + { + "epoch": 0.3273161150577308, + "grad_norm": 1.0593520402908325, + "learning_rate": 9.29623762843734e-06, + "loss": 0.6874, + "step": 22289 + }, + { + "epoch": 0.3277713530480753, + "grad_norm": 1.0576874017715454, + "learning_rate": 9.230980321505594e-06, + "loss": 0.6789, + "step": 22320 + }, + { + "epoch": 0.3282265910384199, + "grad_norm": 0.9419311285018921, + "learning_rate": 9.165900965817668e-06, + "loss": 0.6727, + "step": 22351 + }, + { + "epoch": 0.3286818290287644, + "grad_norm": 1.116232991218567, + "learning_rate": 9.101000295785245e-06, + "loss": 0.6868, + "step": 22382 + }, + { + "epoch": 0.329137067019109, + "grad_norm": 1.1253653764724731, + "learning_rate": 9.036279043803565e-06, + "loss": 0.6782, + "step": 22413 + }, + { + "epoch": 0.3295923050094535, + "grad_norm": 1.1574054956436157, + "learning_rate": 8.971737940243147e-06, + "loss": 0.6801, + "step": 22444 + }, + { + "epoch": 0.3300475429997981, + "grad_norm": 0.9954296946525574, + "learning_rate": 8.907377713441592e-06, + "loss": 0.6815, + "step": 22475 + }, + { + "epoch": 0.33050278099014263, + "grad_norm": 1.0231496095657349, + "learning_rate": 8.843199089695293e-06, + "loss": 0.6828, + "step": 22506 + }, + { + "epoch": 0.33095801898048716, + "grad_norm": 1.0164254903793335, + "learning_rate": 8.779202793251311e-06, + "loss": 0.6806, + "step": 22537 + }, + { + "epoch": 0.33141325697083174, + "grad_norm": 0.9951460361480713, + "learning_rate": 8.715389546299149e-06, + "loss": 0.6838, + "step": 22568 + }, + { + "epoch": 0.33186849496117626, + "grad_norm": 1.117965817451477, + "learning_rate": 8.651760068962617e-06, + "loss": 0.6726, + "step": 22599 + }, + { + "epoch": 0.33232373295152084, + "grad_norm": 0.9313582181930542, + "learning_rate": 8.588315079291733e-06, + "loss": 0.6807, + "step": 22630 + }, + { + "epoch": 0.33277897094186537, + "grad_norm": 1.0607514381408691, + "learning_rate": 8.52505529325457e-06, + "loss": 0.6757, + "step": 22661 + }, + { + "epoch": 0.3332342089322099, + "grad_norm": 0.9300371408462524, + "learning_rate": 8.461981424729216e-06, + "loss": 0.6715, + "step": 22692 + }, + { + "epoch": 0.3336894469225545, + "grad_norm": 1.0345282554626465, + "learning_rate": 8.399094185495725e-06, + "loss": 0.6814, + "step": 22723 + }, + { + "epoch": 0.334144684912899, + "grad_norm": 1.008501648902893, + "learning_rate": 8.336394285228017e-06, + "loss": 0.6874, + "step": 22754 + }, + { + "epoch": 0.3345999229032436, + "grad_norm": 1.2125533819198608, + "learning_rate": 8.273882431485952e-06, + "loss": 0.6822, + "step": 22785 + }, + { + "epoch": 0.3350551608935881, + "grad_norm": 1.0801039934158325, + "learning_rate": 8.211559329707316e-06, + "loss": 0.6934, + "step": 22816 + }, + { + "epoch": 0.3355103988839327, + "grad_norm": 1.054120659828186, + "learning_rate": 8.149425683199823e-06, + "loss": 0.673, + "step": 22847 + }, + { + "epoch": 0.3359656368742772, + "grad_norm": 1.0891995429992676, + "learning_rate": 8.08748219313325e-06, + "loss": 0.6852, + "step": 22878 + }, + { + "epoch": 0.33642087486462174, + "grad_norm": 1.0153359174728394, + "learning_rate": 8.025729558531453e-06, + "loss": 0.6838, + "step": 22909 + }, + { + "epoch": 0.3368761128549663, + "grad_norm": 1.0803651809692383, + "learning_rate": 7.964168476264508e-06, + "loss": 0.6832, + "step": 22940 + }, + { + "epoch": 0.33733135084531085, + "grad_norm": 1.0524797439575195, + "learning_rate": 7.902799641040884e-06, + "loss": 0.6818, + "step": 22971 + }, + { + "epoch": 0.33778658883565543, + "grad_norm": 1.1119606494903564, + "learning_rate": 7.841623745399523e-06, + "loss": 0.6853, + "step": 23002 + }, + { + "epoch": 0.33824182682599996, + "grad_norm": 0.9859051704406738, + "learning_rate": 7.780641479702114e-06, + "loss": 0.6865, + "step": 23033 + }, + { + "epoch": 0.33869706481634454, + "grad_norm": 0.9103122353553772, + "learning_rate": 7.719853532125227e-06, + "loss": 0.6766, + "step": 23064 + }, + { + "epoch": 0.33915230280668907, + "grad_norm": 1.023934245109558, + "learning_rate": 7.65926058865258e-06, + "loss": 0.6781, + "step": 23095 + }, + { + "epoch": 0.3396075407970336, + "grad_norm": 1.0698734521865845, + "learning_rate": 7.598863333067313e-06, + "loss": 0.6848, + "step": 23126 + }, + { + "epoch": 0.3400627787873782, + "grad_norm": 1.1564706563949585, + "learning_rate": 7.538662446944253e-06, + "loss": 0.6868, + "step": 23157 + }, + { + "epoch": 0.3405180167777227, + "grad_norm": 1.114890456199646, + "learning_rate": 7.478658609642211e-06, + "loss": 0.6785, + "step": 23188 + }, + { + "epoch": 0.3409732547680673, + "grad_norm": 1.1149795055389404, + "learning_rate": 7.418852498296327e-06, + "loss": 0.6706, + "step": 23219 + }, + { + "epoch": 0.3414284927584118, + "grad_norm": 1.0689568519592285, + "learning_rate": 7.359244787810457e-06, + "loss": 0.6751, + "step": 23250 + }, + { + "epoch": 0.34188373074875633, + "grad_norm": 1.1450313329696655, + "learning_rate": 7.299836150849493e-06, + "loss": 0.6708, + "step": 23281 + }, + { + "epoch": 0.3423389687391009, + "grad_norm": 1.1399791240692139, + "learning_rate": 7.240627257831847e-06, + "loss": 0.6781, + "step": 23312 + }, + { + "epoch": 0.34279420672944544, + "grad_norm": 1.131117343902588, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.6767, + "step": 23343 + }, + { + "epoch": 0.34324944471979, + "grad_norm": 1.0069117546081543, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.6795, + "step": 23374 + }, + { + "epoch": 0.34370468271013455, + "grad_norm": 1.0576585531234741, + "learning_rate": 7.064205712766226e-06, + "loss": 0.685, + "step": 23405 + }, + { + "epoch": 0.3441599207004791, + "grad_norm": 1.1254713535308838, + "learning_rate": 7.005802454511129e-06, + "loss": 0.6772, + "step": 23436 + }, + { + "epoch": 0.34461515869082365, + "grad_norm": 1.032549262046814, + "learning_rate": 6.947602258329639e-06, + "loss": 0.6777, + "step": 23467 + }, + { + "epoch": 0.3450703966811682, + "grad_norm": 1.019902229309082, + "learning_rate": 6.889605781003078e-06, + "loss": 0.6814, + "step": 23498 + }, + { + "epoch": 0.34552563467151276, + "grad_norm": 1.0798234939575195, + "learning_rate": 6.831813677013776e-06, + "loss": 0.671, + "step": 23529 + }, + { + "epoch": 0.3459808726618573, + "grad_norm": 1.0329113006591797, + "learning_rate": 6.774226598537792e-06, + "loss": 0.6767, + "step": 23560 + }, + { + "epoch": 0.34643611065220187, + "grad_norm": 1.0345433950424194, + "learning_rate": 6.716845195437482e-06, + "loss": 0.6739, + "step": 23591 + }, + { + "epoch": 0.3468913486425464, + "grad_norm": 0.9708922505378723, + "learning_rate": 6.659670115254168e-06, + "loss": 0.6793, + "step": 23622 + }, + { + "epoch": 0.3473465866328909, + "grad_norm": 1.035715103149414, + "learning_rate": 6.602702003200872e-06, + "loss": 0.6847, + "step": 23653 + }, + { + "epoch": 0.3478018246232355, + "grad_norm": 0.9764015078544617, + "learning_rate": 6.545941502154992e-06, + "loss": 0.6815, + "step": 23684 + }, + { + "epoch": 0.34825706261358, + "grad_norm": 1.033187747001648, + "learning_rate": 6.489389252651057e-06, + "loss": 0.6715, + "step": 23715 + }, + { + "epoch": 0.3487123006039246, + "grad_norm": 1.0181931257247925, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.6758, + "step": 23746 + }, + { + "epoch": 0.34916753859426913, + "grad_norm": 1.0105568170547485, + "learning_rate": 6.376912058649559e-06, + "loss": 0.6746, + "step": 23777 + }, + { + "epoch": 0.3496227765846137, + "grad_norm": 1.0559037923812866, + "learning_rate": 6.320988383441845e-06, + "loss": 0.6795, + "step": 23808 + }, + { + "epoch": 0.35007801457495824, + "grad_norm": 1.0579489469528198, + "learning_rate": 6.265275498341452e-06, + "loss": 0.6812, + "step": 23839 + }, + { + "epoch": 0.35053325256530277, + "grad_norm": 1.0675939321517944, + "learning_rate": 6.209774032060714e-06, + "loss": 0.6769, + "step": 23870 + }, + { + "epoch": 0.35098849055564735, + "grad_norm": 0.8914999961853027, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.6825, + "step": 23901 + }, + { + "epoch": 0.3514437285459919, + "grad_norm": 0.9489038586616516, + "learning_rate": 6.099407858871342e-06, + "loss": 0.6801, + "step": 23932 + }, + { + "epoch": 0.35189896653633645, + "grad_norm": 1.040340781211853, + "learning_rate": 6.044544397429958e-06, + "loss": 0.6813, + "step": 23963 + }, + { + "epoch": 0.352354204526681, + "grad_norm": 1.2235099077224731, + "learning_rate": 5.989894845728708e-06, + "loss": 0.6742, + "step": 23994 + }, + { + "epoch": 0.3528094425170255, + "grad_norm": 1.0165103673934937, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.6768, + "step": 24025 + }, + { + "epoch": 0.3532646805073701, + "grad_norm": 1.023253321647644, + "learning_rate": 5.881239935976762e-06, + "loss": 0.678, + "step": 24056 + }, + { + "epoch": 0.3537199184977146, + "grad_norm": 1.0616728067398071, + "learning_rate": 5.827235804081954e-06, + "loss": 0.6723, + "step": 24087 + }, + { + "epoch": 0.3541751564880592, + "grad_norm": 1.021304726600647, + "learning_rate": 5.773448034225221e-06, + "loss": 0.677, + "step": 24118 + }, + { + "epoch": 0.3546303944784037, + "grad_norm": 1.055176854133606, + "learning_rate": 5.719877233394228e-06, + "loss": 0.6779, + "step": 24149 + }, + { + "epoch": 0.3550856324687483, + "grad_norm": 1.0145021677017212, + "learning_rate": 5.666524006128191e-06, + "loss": 0.679, + "step": 24180 + }, + { + "epoch": 0.35554087045909283, + "grad_norm": 1.0869520902633667, + "learning_rate": 5.613388954511015e-06, + "loss": 0.6728, + "step": 24211 + }, + { + "epoch": 0.35599610844943735, + "grad_norm": 0.9280300140380859, + "learning_rate": 5.560472678164552e-06, + "loss": 0.6794, + "step": 24242 + }, + { + "epoch": 0.35645134643978194, + "grad_norm": 0.9949683547019958, + "learning_rate": 5.507775774241775e-06, + "loss": 0.6771, + "step": 24273 + }, + { + "epoch": 0.35690658443012646, + "grad_norm": 1.0003410577774048, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.6789, + "step": 24304 + }, + { + "epoch": 0.35736182242047104, + "grad_norm": 1.0053478479385376, + "learning_rate": 5.403042459894597e-06, + "loss": 0.6783, + "step": 24335 + }, + { + "epoch": 0.35781706041081557, + "grad_norm": 0.992439866065979, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.6734, + "step": 24366 + }, + { + "epoch": 0.35827229840116015, + "grad_norm": 1.0443801879882812, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.6833, + "step": 24397 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8013248630772728e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-24416/training_args.bin b/checkpoint-24416/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-24416/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-27468/config.json b/checkpoint-27468/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-27468/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-27468/generation_config.json b/checkpoint-27468/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-27468/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-27468/model-00001-of-00007.safetensors b/checkpoint-27468/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..baf6f5a98a3c70a45d32d773b31388360d0a4bff --- /dev/null +++ b/checkpoint-27468/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a5b957df6d41139b4a7ccfd129e7ad26244ab800881ba827b4012b1e26d881 +size 4886466168 diff --git a/checkpoint-27468/model-00002-of-00007.safetensors b/checkpoint-27468/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-27468/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-27468/model-00003-of-00007.safetensors b/checkpoint-27468/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-27468/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-27468/model-00004-of-00007.safetensors b/checkpoint-27468/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-27468/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-27468/model-00005-of-00007.safetensors b/checkpoint-27468/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-27468/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-27468/model-00006-of-00007.safetensors b/checkpoint-27468/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e329509d1444681e82f188ae282f344a8794507 --- /dev/null +++ b/checkpoint-27468/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab1644e47886548c2e37269adb11c15b86ebd6fda159578fb3befbede208fe55 +size 4999813120 diff --git a/checkpoint-27468/model-00007-of-00007.safetensors b/checkpoint-27468/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d2dce2df093a724dfbf0bac7bb6670b3e847f42 --- /dev/null +++ b/checkpoint-27468/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b0498d6cc42bb1a7e874b493b270f130390cbcb80cca7d7f294e14ef71f8ca3 +size 2571158184 diff --git a/checkpoint-27468/model.safetensors.index.json b/checkpoint-27468/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-27468/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-27468/optimizer.pt b/checkpoint-27468/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..11f5fdb2bc90dda303117a733194e8435a4d4af9 --- /dev/null +++ b/checkpoint-27468/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1875d9ce75873f77115aa998206015fcba4201f86c6af5098270167e37b812e4 +size 15385036334 diff --git a/checkpoint-27468/rng_state.pth b/checkpoint-27468/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-27468/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-27468/scheduler.pt b/checkpoint-27468/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec6a33a1e16fa727d72d8610d56b97fd04ba15e3 --- /dev/null +++ b/checkpoint-27468/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d47b007e64bffbb0dc51c02560ea2fea14f1ab5035228332be1bd00a38697eb +size 1064 diff --git a/checkpoint-27468/trainer_state.json b/checkpoint-27468/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9d0c00e5f4a8cd3a350f890723c1d09c9dda3987 --- /dev/null +++ b/checkpoint-27468/trainer_state.json @@ -0,0 +1,6235 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4033702296381959, + "eval_steps": 500, + "global_step": 27468, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + }, + { + "epoch": 0.13475044514198653, + "grad_norm": 1.226491928100586, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.7349, + "step": 9176 + }, + { + "epoch": 0.13520568313233108, + "grad_norm": 1.3030654191970825, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.7371, + "step": 9207 + }, + { + "epoch": 0.13566092112267564, + "grad_norm": 1.17802894115448, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.7431, + "step": 9238 + }, + { + "epoch": 0.13611615911302016, + "grad_norm": 1.2893658876419067, + "learning_rate": 4.170395751189495e-05, + "loss": 0.7258, + "step": 9269 + }, + { + "epoch": 0.13657139710336472, + "grad_norm": 1.193758487701416, + "learning_rate": 4.164137885110921e-05, + "loss": 0.7362, + "step": 9300 + }, + { + "epoch": 0.13702663509370927, + "grad_norm": 1.2750978469848633, + "learning_rate": 4.157861239462495e-05, + "loss": 0.7265, + "step": 9331 + }, + { + "epoch": 0.13748187308405382, + "grad_norm": 1.161464810371399, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.7389, + "step": 9362 + }, + { + "epoch": 0.13793711107439838, + "grad_norm": 1.2749390602111816, + "learning_rate": 4.145251892991588e-05, + "loss": 0.7315, + "step": 9393 + }, + { + "epoch": 0.13839234906474293, + "grad_norm": 1.1729388236999512, + "learning_rate": 4.138919334463868e-05, + "loss": 0.7352, + "step": 9424 + }, + { + "epoch": 0.13884758705508746, + "grad_norm": 1.0987122058868408, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.7347, + "step": 9455 + }, + { + "epoch": 0.139302825045432, + "grad_norm": 1.0855766534805298, + "learning_rate": 4.126198804133398e-05, + "loss": 0.7271, + "step": 9486 + }, + { + "epoch": 0.13975806303577656, + "grad_norm": 1.1782172918319702, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.74, + "step": 9517 + }, + { + "epoch": 0.14021330102612112, + "grad_norm": 1.186962366104126, + "learning_rate": 4.113404868280107e-05, + "loss": 0.725, + "step": 9548 + }, + { + "epoch": 0.14066853901646567, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.106980553625457e-05, + "loss": 0.7346, + "step": 9579 + }, + { + "epoch": 0.14112377700681022, + "grad_norm": 1.154897928237915, + "learning_rate": 4.100538104413674e-05, + "loss": 0.7348, + "step": 9610 + }, + { + "epoch": 0.14157901499715475, + "grad_norm": 1.203722357749939, + "learning_rate": 4.09407759334692e-05, + "loss": 0.7312, + "step": 9641 + }, + { + "epoch": 0.1420342529874993, + "grad_norm": 1.102858543395996, + "learning_rate": 4.087599093331186e-05, + "loss": 0.7227, + "step": 9672 + }, + { + "epoch": 0.14248949097784386, + "grad_norm": 1.2667406797409058, + "learning_rate": 4.081102677475462e-05, + "loss": 0.727, + "step": 9703 + }, + { + "epoch": 0.1429447289681884, + "grad_norm": 1.221291422843933, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.7317, + "step": 9734 + }, + { + "epoch": 0.14339996695853297, + "grad_norm": 1.0426162481307983, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.7299, + "step": 9765 + }, + { + "epoch": 0.14385520494887752, + "grad_norm": 1.2158912420272827, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.7399, + "step": 9796 + }, + { + "epoch": 0.14431044293922204, + "grad_norm": 1.2689307928085327, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 0.1447656809295666, + "grad_norm": 1.3168833255767822, + "learning_rate": 4.048354433517794e-05, + "loss": 0.7258, + "step": 9858 + }, + { + "epoch": 0.14522091891991115, + "grad_norm": 1.1966122388839722, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.7375, + "step": 9889 + }, + { + "epoch": 0.1456761569102557, + "grad_norm": 1.1252835988998413, + "learning_rate": 4.035132306369438e-05, + "loss": 0.7254, + "step": 9920 + }, + { + "epoch": 0.14613139490060026, + "grad_norm": 1.186324119567871, + "learning_rate": 4.028495219804555e-05, + "loss": 0.7337, + "step": 9951 + }, + { + "epoch": 0.1465866328909448, + "grad_norm": 1.1606496572494507, + "learning_rate": 4.021840884378864e-05, + "loss": 0.7273, + "step": 9982 + }, + { + "epoch": 0.14704187088128934, + "grad_norm": 1.316298007965088, + "learning_rate": 4.015169375185633e-05, + "loss": 0.7276, + "step": 10013 + }, + { + "epoch": 0.1474971088716339, + "grad_norm": 1.224236249923706, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.7328, + "step": 10044 + }, + { + "epoch": 0.14795234686197845, + "grad_norm": 1.227148175239563, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.739, + "step": 10075 + }, + { + "epoch": 0.148407584852323, + "grad_norm": 1.1307867765426636, + "learning_rate": 3.995052558835377e-05, + "loss": 0.7324, + "step": 10106 + }, + { + "epoch": 0.14886282284266755, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.988313109368017e-05, + "loss": 0.7263, + "step": 10137 + }, + { + "epoch": 0.1493180608330121, + "grad_norm": 1.1724634170532227, + "learning_rate": 3.981556864489504e-05, + "loss": 0.7289, + "step": 10168 + }, + { + "epoch": 0.14977329882335666, + "grad_norm": 1.140684723854065, + "learning_rate": 3.974783900443142e-05, + "loss": 0.7309, + "step": 10199 + }, + { + "epoch": 0.15022853681370119, + "grad_norm": 1.167183756828308, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.729, + "step": 10230 + }, + { + "epoch": 0.15068377480404574, + "grad_norm": 1.2216546535491943, + "learning_rate": 3.961188120762596e-05, + "loss": 0.7288, + "step": 10261 + }, + { + "epoch": 0.1511390127943903, + "grad_norm": 1.227397084236145, + "learning_rate": 3.954365458554938e-05, + "loss": 0.7313, + "step": 10292 + }, + { + "epoch": 0.15159425078473485, + "grad_norm": 1.075441837310791, + "learning_rate": 3.947526384030751e-05, + "loss": 0.7306, + "step": 10323 + }, + { + "epoch": 0.1520494887750794, + "grad_norm": 1.1227167844772339, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.7287, + "step": 10354 + }, + { + "epoch": 0.15250472676542395, + "grad_norm": 1.0665740966796875, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.7246, + "step": 10385 + }, + { + "epoch": 0.15295996475576848, + "grad_norm": 1.250580072402954, + "learning_rate": 3.926911459260109e-05, + "loss": 0.7183, + "step": 10416 + }, + { + "epoch": 0.15341520274611303, + "grad_norm": 1.1809351444244385, + "learning_rate": 3.920007509089102e-05, + "loss": 0.7307, + "step": 10447 + }, + { + "epoch": 0.1538704407364576, + "grad_norm": 1.3934037685394287, + "learning_rate": 3.913087534326357e-05, + "loss": 0.7151, + "step": 10478 + }, + { + "epoch": 0.15432567872680214, + "grad_norm": 1.1272441148757935, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.7309, + "step": 10509 + }, + { + "epoch": 0.1547809167171467, + "grad_norm": 1.2410191297531128, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.7375, + "step": 10540 + }, + { + "epoch": 0.15523615470749125, + "grad_norm": 1.1426646709442139, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.7342, + "step": 10571 + }, + { + "epoch": 0.15569139269783577, + "grad_norm": 1.2037526369094849, + "learning_rate": 3.885248953871491e-05, + "loss": 0.7329, + "step": 10602 + }, + { + "epoch": 0.15614663068818033, + "grad_norm": 1.2198660373687744, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.7367, + "step": 10633 + }, + { + "epoch": 0.15660186867852488, + "grad_norm": 1.3129019737243652, + "learning_rate": 3.871235554965218e-05, + "loss": 0.7236, + "step": 10664 + }, + { + "epoch": 0.15705710666886943, + "grad_norm": 1.2182697057724, + "learning_rate": 3.864205604623078e-05, + "loss": 0.721, + "step": 10695 + }, + { + "epoch": 0.157512344659214, + "grad_norm": 1.1494821310043335, + "learning_rate": 3.857160259406107e-05, + "loss": 0.7223, + "step": 10726 + }, + { + "epoch": 0.15796758264955854, + "grad_norm": 1.0680466890335083, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.7224, + "step": 10757 + }, + { + "epoch": 0.15842282063990307, + "grad_norm": 1.155052661895752, + "learning_rate": 3.843023702543556e-05, + "loss": 0.7268, + "step": 10788 + }, + { + "epoch": 0.15887805863024762, + "grad_norm": 1.1415716409683228, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.7167, + "step": 10819 + }, + { + "epoch": 0.15933329662059217, + "grad_norm": 1.1486759185791016, + "learning_rate": 3.828826522492255e-05, + "loss": 0.7236, + "step": 10850 + }, + { + "epoch": 0.15978853461093673, + "grad_norm": 1.1916829347610474, + "learning_rate": 3.821705398930713e-05, + "loss": 0.7269, + "step": 10881 + }, + { + "epoch": 0.16024377260128128, + "grad_norm": 1.180760383605957, + "learning_rate": 3.814569360103385e-05, + "loss": 0.724, + "step": 10912 + }, + { + "epoch": 0.16069901059162583, + "grad_norm": 1.197651982307434, + "learning_rate": 3.807418486539499e-05, + "loss": 0.7265, + "step": 10943 + }, + { + "epoch": 0.16115424858197036, + "grad_norm": 1.1579455137252808, + "learning_rate": 3.80025285893569e-05, + "loss": 0.7247, + "step": 10974 + }, + { + "epoch": 0.16160948657231491, + "grad_norm": 1.1810616254806519, + "learning_rate": 3.793072558155093e-05, + "loss": 0.7256, + "step": 11005 + }, + { + "epoch": 0.16206472456265947, + "grad_norm": 1.1510332822799683, + "learning_rate": 3.785877665226426e-05, + "loss": 0.722, + "step": 11036 + }, + { + "epoch": 0.16251996255300402, + "grad_norm": 1.0499261617660522, + "learning_rate": 3.778668261343079e-05, + "loss": 0.7255, + "step": 11067 + }, + { + "epoch": 0.16297520054334858, + "grad_norm": 1.1509817838668823, + "learning_rate": 3.771444427862192e-05, + "loss": 0.7147, + "step": 11098 + }, + { + "epoch": 0.16343043853369313, + "grad_norm": 1.1638833284378052, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.7187, + "step": 11129 + }, + { + "epoch": 0.16388567652403765, + "grad_norm": 1.024095058441162, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.7244, + "step": 11160 + }, + { + "epoch": 0.1643409145143822, + "grad_norm": 1.1326078176498413, + "learning_rate": 3.749687165842753e-05, + "loss": 0.7221, + "step": 11191 + }, + { + "epoch": 0.16479615250472676, + "grad_norm": 1.1863468885421753, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 0.16525139049507132, + "grad_norm": 1.2948099374771118, + "learning_rate": 3.735111675341645e-05, + "loss": 0.7249, + "step": 11253 + }, + { + "epoch": 0.16570662848541587, + "grad_norm": 1.1384631395339966, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.7203, + "step": 11284 + }, + { + "epoch": 0.16616186647576042, + "grad_norm": 1.1335748434066772, + "learning_rate": 3.720480432728287e-05, + "loss": 0.7183, + "step": 11315 + }, + { + "epoch": 0.16661710446610495, + "grad_norm": 1.2514820098876953, + "learning_rate": 3.71314411067092e-05, + "loss": 0.7216, + "step": 11346 + }, + { + "epoch": 0.1670723424564495, + "grad_norm": 1.128739833831787, + "learning_rate": 3.70579409844715e-05, + "loss": 0.7234, + "step": 11377 + }, + { + "epoch": 0.16752758044679406, + "grad_norm": 1.152342677116394, + "learning_rate": 3.698430479000865e-05, + "loss": 0.7304, + "step": 11408 + }, + { + "epoch": 0.1679828184371386, + "grad_norm": 1.222424864768982, + "learning_rate": 3.691053335429509e-05, + "loss": 0.7164, + "step": 11439 + }, + { + "epoch": 0.16843805642748316, + "grad_norm": 1.2406268119812012, + "learning_rate": 3.683662750983147e-05, + "loss": 0.7157, + "step": 11470 + }, + { + "epoch": 0.16889329441782772, + "grad_norm": 1.1266759634017944, + "learning_rate": 3.676258809063518e-05, + "loss": 0.7228, + "step": 11501 + }, + { + "epoch": 0.16934853240817227, + "grad_norm": 1.1239662170410156, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.7266, + "step": 11532 + }, + { + "epoch": 0.1698037703985168, + "grad_norm": 1.0813047885894775, + "learning_rate": 3.661411187164166e-05, + "loss": 0.7278, + "step": 11563 + }, + { + "epoch": 0.17025900838886135, + "grad_norm": 56.909889221191406, + "learning_rate": 3.65396767473784e-05, + "loss": 0.7163, + "step": 11594 + }, + { + "epoch": 0.1707142463792059, + "grad_norm": 1.0749647617340088, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 0.17116948436955046, + "grad_norm": 1.0705510377883911, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.726, + "step": 11656 + }, + { + "epoch": 0.171624722359895, + "grad_norm": 1.1083126068115234, + "learning_rate": 3.63155933997859e-05, + "loss": 0.7262, + "step": 11687 + }, + { + "epoch": 0.17207996035023956, + "grad_norm": 1.2302770614624023, + "learning_rate": 3.624064243537758e-05, + "loss": 0.7229, + "step": 11718 + }, + { + "epoch": 0.1725351983405841, + "grad_norm": 1.240893006324768, + "learning_rate": 3.616556462184716e-05, + "loss": 0.7228, + "step": 11749 + }, + { + "epoch": 0.17299043633092864, + "grad_norm": 1.1897794008255005, + "learning_rate": 3.609036080643755e-05, + "loss": 0.7178, + "step": 11780 + }, + { + "epoch": 0.1734456743212732, + "grad_norm": 1.244950294494629, + "learning_rate": 3.60150318378136e-05, + "loss": 0.7244, + "step": 11811 + }, + { + "epoch": 0.17390091231161775, + "grad_norm": 1.1689528226852417, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.7181, + "step": 11842 + }, + { + "epoch": 0.1743561503019623, + "grad_norm": 1.1300387382507324, + "learning_rate": 3.586400184263408e-05, + "loss": 0.7176, + "step": 11873 + }, + { + "epoch": 0.17481138829230686, + "grad_norm": 1.0884469747543335, + "learning_rate": 3.578830252043148e-05, + "loss": 0.7182, + "step": 11904 + }, + { + "epoch": 0.17526662628265138, + "grad_norm": 1.119352102279663, + "learning_rate": 3.571248145370125e-05, + "loss": 0.7223, + "step": 11935 + }, + { + "epoch": 0.17572186427299594, + "grad_norm": 1.0979810953140259, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.7233, + "step": 11966 + }, + { + "epoch": 0.1761771022633405, + "grad_norm": 1.138571858406067, + "learning_rate": 3.556047751054378e-05, + "loss": 0.7279, + "step": 11997 + }, + { + "epoch": 0.17663234025368504, + "grad_norm": 1.2011150121688843, + "learning_rate": 3.548429634946039e-05, + "loss": 0.717, + "step": 12028 + }, + { + "epoch": 0.1770875782440296, + "grad_norm": 1.119472861289978, + "learning_rate": 3.540799687451768e-05, + "loss": 0.7227, + "step": 12059 + }, + { + "epoch": 0.17754281623437415, + "grad_norm": 1.089117169380188, + "learning_rate": 3.533157994674485e-05, + "loss": 0.7132, + "step": 12090 + }, + { + "epoch": 0.17799805422471868, + "grad_norm": 1.0937649011611938, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.7285, + "step": 12121 + }, + { + "epoch": 0.17845329221506323, + "grad_norm": 1.0359808206558228, + "learning_rate": 3.517839718344311e-05, + "loss": 0.7178, + "step": 12152 + }, + { + "epoch": 0.17890853020540778, + "grad_norm": 1.1309690475463867, + "learning_rate": 3.510163307656086e-05, + "loss": 0.713, + "step": 12183 + }, + { + "epoch": 0.17936376819575234, + "grad_norm": 1.126597285270691, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.7139, + "step": 12214 + }, + { + "epoch": 0.1798190061860969, + "grad_norm": 1.29429292678833, + "learning_rate": 3.494776374368643e-05, + "loss": 0.7261, + "step": 12245 + }, + { + "epoch": 0.18027424417644144, + "grad_norm": 1.1156132221221924, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.7124, + "step": 12276 + }, + { + "epoch": 0.18072948216678597, + "grad_norm": 1.1548200845718384, + "learning_rate": 3.479344537543164e-05, + "loss": 0.7195, + "step": 12307 + }, + { + "epoch": 0.18118472015713052, + "grad_norm": 1.1318944692611694, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.7215, + "step": 12338 + }, + { + "epoch": 0.18163995814747508, + "grad_norm": 1.1952061653137207, + "learning_rate": 3.463868493762412e-05, + "loss": 0.7159, + "step": 12369 + }, + { + "epoch": 0.18209519613781963, + "grad_norm": 1.1703433990478516, + "learning_rate": 3.456114112492418e-05, + "loss": 0.7145, + "step": 12400 + }, + { + "epoch": 0.18255043412816419, + "grad_norm": 1.065319538116455, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.7108, + "step": 12431 + }, + { + "epoch": 0.18300567211850874, + "grad_norm": 1.278533697128296, + "learning_rate": 3.440573068727905e-05, + "loss": 0.7116, + "step": 12462 + }, + { + "epoch": 0.18346091010885326, + "grad_norm": 1.1642391681671143, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.716, + "step": 12493 + }, + { + "epoch": 0.18391614809919782, + "grad_norm": 1.2108798027038574, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.7194, + "step": 12524 + }, + { + "epoch": 0.18437138608954237, + "grad_norm": 1.085163950920105, + "learning_rate": 3.417182116258899e-05, + "loss": 0.7166, + "step": 12555 + }, + { + "epoch": 0.18482662407988693, + "grad_norm": 1.124064564704895, + "learning_rate": 3.409364314116074e-05, + "loss": 0.7218, + "step": 12586 + }, + { + "epoch": 0.18528186207023148, + "grad_norm": 1.1817457675933838, + "learning_rate": 3.401536249920559e-05, + "loss": 0.7161, + "step": 12617 + }, + { + "epoch": 0.18573710006057603, + "grad_norm": 1.1430065631866455, + "learning_rate": 3.393698012010998e-05, + "loss": 0.7223, + "step": 12648 + }, + { + "epoch": 0.18619233805092056, + "grad_norm": 1.151633620262146, + "learning_rate": 3.385849688840839e-05, + "loss": 0.7181, + "step": 12679 + }, + { + "epoch": 0.1866475760412651, + "grad_norm": 1.1847879886627197, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.7144, + "step": 12710 + }, + { + "epoch": 0.18710281403160967, + "grad_norm": 1.0997257232666016, + "learning_rate": 3.370123141100578e-05, + "loss": 0.7179, + "step": 12741 + }, + { + "epoch": 0.18755805202195422, + "grad_norm": 1.3207893371582031, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.7147, + "step": 12772 + }, + { + "epoch": 0.18801329001229877, + "grad_norm": 1.1414912939071655, + "learning_rate": 3.35435731658559e-05, + "loss": 0.7258, + "step": 12803 + }, + { + "epoch": 0.18846852800264333, + "grad_norm": 1.1708139181137085, + "learning_rate": 3.346459897862552e-05, + "loss": 0.7222, + "step": 12834 + }, + { + "epoch": 0.18892376599298788, + "grad_norm": 1.0472559928894043, + "learning_rate": 3.338552926954613e-05, + "loss": 0.7168, + "step": 12865 + }, + { + "epoch": 0.1893790039833324, + "grad_norm": 1.1117762327194214, + "learning_rate": 3.330636493090868e-05, + "loss": 0.7128, + "step": 12896 + }, + { + "epoch": 0.18983424197367696, + "grad_norm": 1.1351251602172852, + "learning_rate": 3.322710685607193e-05, + "loss": 0.72, + "step": 12927 + }, + { + "epoch": 0.1902894799640215, + "grad_norm": 1.1530694961547852, + "learning_rate": 3.314775593945251e-05, + "loss": 0.7133, + "step": 12958 + }, + { + "epoch": 0.19074471795436607, + "grad_norm": 1.1590766906738281, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.7236, + "step": 12989 + }, + { + "epoch": 0.19119995594471062, + "grad_norm": 1.0657463073730469, + "learning_rate": 3.298877916376047e-05, + "loss": 0.7159, + "step": 13020 + }, + { + "epoch": 0.19165519393505517, + "grad_norm": 1.090234637260437, + "learning_rate": 3.290915509871915e-05, + "loss": 0.7249, + "step": 13051 + }, + { + "epoch": 0.1921104319253997, + "grad_norm": 1.1001083850860596, + "learning_rate": 3.282944177993753e-05, + "loss": 0.721, + "step": 13082 + }, + { + "epoch": 0.19256566991574425, + "grad_norm": 1.0616897344589233, + "learning_rate": 3.274964010696957e-05, + "loss": 0.7145, + "step": 13113 + }, + { + "epoch": 0.1930209079060888, + "grad_norm": 1.05698823928833, + "learning_rate": 3.266975098036629e-05, + "loss": 0.7106, + "step": 13144 + }, + { + "epoch": 0.19347614589643336, + "grad_norm": 1.0747356414794922, + "learning_rate": 3.258977530166562e-05, + "loss": 0.7199, + "step": 13175 + }, + { + "epoch": 0.1939313838867779, + "grad_norm": 1.112629771232605, + "learning_rate": 3.250971397338227e-05, + "loss": 0.7088, + "step": 13206 + }, + { + "epoch": 0.19438662187712247, + "grad_norm": 1.13507878780365, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.7084, + "step": 13237 + }, + { + "epoch": 0.194841859867467, + "grad_norm": 1.137793779373169, + "learning_rate": 3.234933798294859e-05, + "loss": 0.7121, + "step": 13268 + }, + { + "epoch": 0.19529709785781155, + "grad_norm": 1.263027548789978, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.7117, + "step": 13299 + }, + { + "epoch": 0.1957523358481561, + "grad_norm": 1.1799430847167969, + "learning_rate": 3.218863024832985e-05, + "loss": 0.7144, + "step": 13330 + }, + { + "epoch": 0.19620757383850065, + "grad_norm": 1.040297508239746, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.7089, + "step": 13361 + }, + { + "epoch": 0.1966628118288452, + "grad_norm": 1.110816240310669, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.7166, + "step": 13392 + }, + { + "epoch": 0.19711804981918976, + "grad_norm": 1.0399518013000488, + "learning_rate": 3.194696249871729e-05, + "loss": 0.7081, + "step": 13423 + }, + { + "epoch": 0.1975732878095343, + "grad_norm": 1.1946238279342651, + "learning_rate": 3.186624857814164e-05, + "loss": 0.7112, + "step": 13454 + }, + { + "epoch": 0.19802852579987884, + "grad_norm": 1.1289162635803223, + "learning_rate": 3.178545717288401e-05, + "loss": 0.7079, + "step": 13485 + }, + { + "epoch": 0.1984837637902234, + "grad_norm": 1.17491614818573, + "learning_rate": 3.170458919466444e-05, + "loss": 0.7117, + "step": 13516 + }, + { + "epoch": 0.19893900178056795, + "grad_norm": 1.1498847007751465, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.7131, + "step": 13547 + }, + { + "epoch": 0.1993942397709125, + "grad_norm": 1.2516326904296875, + "learning_rate": 3.154262717052985e-05, + "loss": 0.7145, + "step": 13578 + }, + { + "epoch": 0.19984947776125705, + "grad_norm": 1.2112692594528198, + "learning_rate": 3.146153495233426e-05, + "loss": 0.7098, + "step": 13609 + }, + { + "epoch": 0.20030471575160158, + "grad_norm": 0.989539384841919, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.7182, + "step": 13640 + }, + { + "epoch": 0.20075995374194613, + "grad_norm": 1.0849697589874268, + "learning_rate": 3.129913267924946e-05, + "loss": 0.7153, + "step": 13671 + }, + { + "epoch": 0.2012151917322907, + "grad_norm": 1.1133675575256348, + "learning_rate": 3.121782445704782e-05, + "loss": 0.7104, + "step": 13702 + }, + { + "epoch": 0.20167042972263524, + "grad_norm": 1.1086009740829468, + "learning_rate": 3.11364460675423e-05, + "loss": 0.7125, + "step": 13733 + }, + { + "epoch": 0.2021256677129798, + "grad_norm": 1.0160905122756958, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.7055, + "step": 13764 + }, + { + "epoch": 0.20258090570332435, + "grad_norm": 1.0811957120895386, + "learning_rate": 3.097348246077728e-05, + "loss": 0.7105, + "step": 13795 + }, + { + "epoch": 0.20303614369366887, + "grad_norm": 1.0012198686599731, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.7134, + "step": 13826 + }, + { + "epoch": 0.20349138168401343, + "grad_norm": 1.2298318147659302, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.7054, + "step": 13857 + }, + { + "epoch": 0.20394661967435798, + "grad_norm": 1.1324481964111328, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.7023, + "step": 13888 + }, + { + "epoch": 0.20440185766470254, + "grad_norm": 1.1454377174377441, + "learning_rate": 3.064675369851637e-05, + "loss": 0.7124, + "step": 13919 + }, + { + "epoch": 0.2048570956550471, + "grad_norm": 1.0461289882659912, + "learning_rate": 3.056490989455289e-05, + "loss": 0.7159, + "step": 13950 + }, + { + "epoch": 0.20531233364539164, + "grad_norm": 1.0020164251327515, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.702, + "step": 13981 + }, + { + "epoch": 0.20576757163573617, + "grad_norm": 1.0952621698379517, + "learning_rate": 3.040103481317539e-05, + "loss": 0.7108, + "step": 14012 + }, + { + "epoch": 0.20622280962608072, + "grad_norm": 1.129813551902771, + "learning_rate": 3.03190053850694e-05, + "loss": 0.7043, + "step": 14043 + }, + { + "epoch": 0.20667804761642528, + "grad_norm": 1.012137770652771, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.7002, + "step": 14074 + }, + { + "epoch": 0.20713328560676983, + "grad_norm": 1.1774550676345825, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.6996, + "step": 14105 + }, + { + "epoch": 0.20758852359711438, + "grad_norm": 1.0878807306289673, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.7075, + "step": 14136 + }, + { + "epoch": 0.20804376158745894, + "grad_norm": 1.1599992513656616, + "learning_rate": 2.999029669712431e-05, + "loss": 0.7165, + "step": 14167 + }, + { + "epoch": 0.2084989995778035, + "grad_norm": 1.184403419494629, + "learning_rate": 2.990797641805408e-05, + "loss": 0.709, + "step": 14198 + }, + { + "epoch": 0.20895423756814802, + "grad_norm": 1.0857961177825928, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6986, + "step": 14229 + }, + { + "epoch": 0.20940947555849257, + "grad_norm": 1.0922378301620483, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.7085, + "step": 14260 + }, + { + "epoch": 0.20986471354883712, + "grad_norm": 1.0843058824539185, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.7114, + "step": 14291 + }, + { + "epoch": 0.21031995153918168, + "grad_norm": 1.2273341417312622, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.7092, + "step": 14322 + }, + { + "epoch": 0.21077518952952623, + "grad_norm": 1.1041830778121948, + "learning_rate": 2.949556283611942e-05, + "loss": 0.7086, + "step": 14353 + }, + { + "epoch": 0.21123042751987078, + "grad_norm": 1.1019948720932007, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.7095, + "step": 14384 + }, + { + "epoch": 0.2116856655102153, + "grad_norm": 1.1595929861068726, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.7083, + "step": 14415 + }, + { + "epoch": 0.21214090350055986, + "grad_norm": 1.252021312713623, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.7045, + "step": 14446 + }, + { + "epoch": 0.21259614149090442, + "grad_norm": 1.1469565629959106, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6995, + "step": 14477 + }, + { + "epoch": 0.21305137948124897, + "grad_norm": 1.1796189546585083, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.7101, + "step": 14508 + }, + { + "epoch": 0.21350661747159352, + "grad_norm": 1.183271884918213, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.7074, + "step": 14539 + }, + { + "epoch": 0.21396185546193808, + "grad_norm": 1.2489229440689087, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.7126, + "step": 14570 + }, + { + "epoch": 0.2144170934522826, + "grad_norm": 1.059584379196167, + "learning_rate": 2.883311164593017e-05, + "loss": 0.7045, + "step": 14601 + }, + { + "epoch": 0.21487233144262716, + "grad_norm": 1.1565430164337158, + "learning_rate": 2.875010077160754e-05, + "loss": 0.7075, + "step": 14632 + }, + { + "epoch": 0.2153275694329717, + "grad_norm": 1.0368664264678955, + "learning_rate": 2.866704757790741e-05, + "loss": 0.7065, + "step": 14663 + }, + { + "epoch": 0.21578280742331626, + "grad_norm": 1.1395366191864014, + "learning_rate": 2.858395300207376e-05, + "loss": 0.7029, + "step": 14694 + }, + { + "epoch": 0.21623804541366082, + "grad_norm": 1.077816128730774, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.7078, + "step": 14725 + }, + { + "epoch": 0.21669328340400537, + "grad_norm": 1.2020843029022217, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.7011, + "step": 14756 + }, + { + "epoch": 0.2171485213943499, + "grad_norm": 1.0293958187103271, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.7038, + "step": 14787 + }, + { + "epoch": 0.21760375938469445, + "grad_norm": 1.2163504362106323, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.7029, + "step": 14818 + }, + { + "epoch": 0.218058997375039, + "grad_norm": 1.0222349166870117, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.7084, + "step": 14849 + }, + { + "epoch": 0.21851423536538356, + "grad_norm": 1.0532034635543823, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.7162, + "step": 14880 + }, + { + "epoch": 0.2189694733557281, + "grad_norm": 0.9981489181518555, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.7088, + "step": 14911 + }, + { + "epoch": 0.21942471134607267, + "grad_norm": 1.0804964303970337, + "learning_rate": 2.791781925709473e-05, + "loss": 0.7035, + "step": 14942 + }, + { + "epoch": 0.2198799493364172, + "grad_norm": 1.1326287984848022, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.708, + "step": 14973 + }, + { + "epoch": 0.22033518732676174, + "grad_norm": 1.1164413690567017, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.7047, + "step": 15004 + }, + { + "epoch": 0.2207904253171063, + "grad_norm": 1.1455166339874268, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.708, + "step": 15035 + }, + { + "epoch": 0.22124566330745085, + "grad_norm": 1.1329716444015503, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6923, + "step": 15066 + }, + { + "epoch": 0.2217009012977954, + "grad_norm": 1.3479692935943604, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.7119, + "step": 15097 + }, + { + "epoch": 0.22215613928813996, + "grad_norm": 1.1160681247711182, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.7031, + "step": 15128 + }, + { + "epoch": 0.22261137727848448, + "grad_norm": 1.1188467741012573, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.7029, + "step": 15159 + }, + { + "epoch": 0.22306661526882904, + "grad_norm": 0.9910608530044556, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6996, + "step": 15190 + }, + { + "epoch": 0.2235218532591736, + "grad_norm": 1.100176453590393, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.7027, + "step": 15221 + }, + { + "epoch": 0.22397709124951815, + "grad_norm": 1.1848573684692383, + "learning_rate": 2.708224532974953e-05, + "loss": 0.7111, + "step": 15252 + }, + { + "epoch": 0.2244323292398627, + "grad_norm": 1.1293883323669434, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.7045, + "step": 15283 + }, + { + "epoch": 0.22488756723020725, + "grad_norm": 1.1224740743637085, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.7113, + "step": 15314 + }, + { + "epoch": 0.2253428052205518, + "grad_norm": 1.2419655323028564, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.7062, + "step": 15345 + }, + { + "epoch": 0.22579804321089633, + "grad_norm": 1.1906564235687256, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.7112, + "step": 15376 + }, + { + "epoch": 0.22625328120124089, + "grad_norm": 1.0610102415084839, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.7052, + "step": 15407 + }, + { + "epoch": 0.22670851919158544, + "grad_norm": 1.3254245519638062, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.6975, + "step": 15438 + }, + { + "epoch": 0.22716375718193, + "grad_norm": 1.1128469705581665, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.6993, + "step": 15469 + }, + { + "epoch": 0.22761899517227455, + "grad_norm": 1.0977287292480469, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.7001, + "step": 15500 + }, + { + "epoch": 0.2280742331626191, + "grad_norm": 0.9699016213417053, + "learning_rate": 2.632819298478939e-05, + "loss": 0.7082, + "step": 15531 + }, + { + "epoch": 0.22852947115296363, + "grad_norm": 1.1493170261383057, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.7019, + "step": 15562 + }, + { + "epoch": 0.22898470914330818, + "grad_norm": 1.1549670696258545, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.7087, + "step": 15593 + }, + { + "epoch": 0.22943994713365273, + "grad_norm": 1.2285927534103394, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.695, + "step": 15624 + }, + { + "epoch": 0.2298951851239973, + "grad_norm": 1.0625406503677368, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.7072, + "step": 15655 + }, + { + "epoch": 0.23035042311434184, + "grad_norm": 1.2031610012054443, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.6952, + "step": 15686 + }, + { + "epoch": 0.2308056611046864, + "grad_norm": 1.0590460300445557, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.6981, + "step": 15717 + }, + { + "epoch": 0.23126089909503092, + "grad_norm": 1.0085610151290894, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.7006, + "step": 15748 + }, + { + "epoch": 0.23171613708537547, + "grad_norm": 1.1644418239593506, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.7023, + "step": 15779 + }, + { + "epoch": 0.23217137507572003, + "grad_norm": 1.0243310928344727, + "learning_rate": 2.557292666450159e-05, + "loss": 0.7106, + "step": 15810 + }, + { + "epoch": 0.23262661306606458, + "grad_norm": 1.0970982313156128, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.7018, + "step": 15841 + }, + { + "epoch": 0.23308185105640913, + "grad_norm": 1.0774227380752563, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.7058, + "step": 15872 + }, + { + "epoch": 0.2335370890467537, + "grad_norm": 1.2018071413040161, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.7072, + "step": 15903 + }, + { + "epoch": 0.2339923270370982, + "grad_norm": 1.1195406913757324, + "learning_rate": 2.52370435981567e-05, + "loss": 0.6971, + "step": 15934 + }, + { + "epoch": 0.23444756502744277, + "grad_norm": 1.0707147121429443, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.7005, + "step": 15965 + }, + { + "epoch": 0.23490280301778732, + "grad_norm": 1.0488064289093018, + "learning_rate": 2.506908222441045e-05, + "loss": 0.6994, + "step": 15996 + }, + { + "epoch": 0.23535804100813187, + "grad_norm": 1.0699859857559204, + "learning_rate": 2.498509989430187e-05, + "loss": 0.6931, + "step": 16027 + }, + { + "epoch": 0.23581327899847643, + "grad_norm": 1.0461689233779907, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.7022, + "step": 16058 + }, + { + "epoch": 0.23626851698882098, + "grad_norm": 1.096604824066162, + "learning_rate": 2.481713668624899e-05, + "loss": 0.7043, + "step": 16089 + }, + { + "epoch": 0.2367237549791655, + "grad_norm": 1.0687739849090576, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.7043, + "step": 16120 + }, + { + "epoch": 0.23717899296951006, + "grad_norm": 1.1307755708694458, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.7059, + "step": 16151 + }, + { + "epoch": 0.23763423095985461, + "grad_norm": 1.0404301881790161, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.6981, + "step": 16182 + }, + { + "epoch": 0.23808946895019917, + "grad_norm": 1.0836886167526245, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.7145, + "step": 16213 + }, + { + "epoch": 0.23854470694054372, + "grad_norm": 1.0622589588165283, + "learning_rate": 2.439728136286796e-05, + "loss": 0.7069, + "step": 16244 + }, + { + "epoch": 0.23899994493088828, + "grad_norm": 1.1610299348831177, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.7022, + "step": 16275 + }, + { + "epoch": 0.2394551829212328, + "grad_norm": 1.004273772239685, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.6905, + "step": 16306 + }, + { + "epoch": 0.23991042091157735, + "grad_norm": 1.0684071779251099, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.6977, + "step": 16337 + }, + { + "epoch": 0.2403656589019219, + "grad_norm": 0.9177312850952148, + "learning_rate": 2.406151384602059e-05, + "loss": 0.6979, + "step": 16368 + }, + { + "epoch": 0.24082089689226646, + "grad_norm": 1.0734107494354248, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.6987, + "step": 16399 + }, + { + "epoch": 0.24127613488261102, + "grad_norm": 1.1414164304733276, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.6927, + "step": 16430 + }, + { + "epoch": 0.24173137287295557, + "grad_norm": 1.1547383069992065, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.7053, + "step": 16461 + }, + { + "epoch": 0.2421866108633001, + "grad_norm": 1.0909677743911743, + "learning_rate": 2.372591577780202e-05, + "loss": 0.6987, + "step": 16492 + }, + { + "epoch": 0.24264184885364465, + "grad_norm": 1.0706005096435547, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.7014, + "step": 16523 + }, + { + "epoch": 0.2430970868439892, + "grad_norm": 1.0389344692230225, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.695, + "step": 16554 + }, + { + "epoch": 0.24355232483433376, + "grad_norm": 1.0836538076400757, + "learning_rate": 2.347436487983929e-05, + "loss": 0.7004, + "step": 16585 + }, + { + "epoch": 0.2440075628246783, + "grad_norm": 1.0748459100723267, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.7018, + "step": 16616 + }, + { + "epoch": 0.24446280081502286, + "grad_norm": 1.097935438156128, + "learning_rate": 2.330674878704035e-05, + "loss": 0.706, + "step": 16647 + }, + { + "epoch": 0.24491803880536742, + "grad_norm": 1.1082520484924316, + "learning_rate": 2.322296892997561e-05, + "loss": 0.7012, + "step": 16678 + }, + { + "epoch": 0.24537327679571194, + "grad_norm": 1.0682934522628784, + "learning_rate": 2.313920912646497e-05, + "loss": 0.701, + "step": 16709 + }, + { + "epoch": 0.2458285147860565, + "grad_norm": 1.1116893291473389, + "learning_rate": 2.305547032172643e-05, + "loss": 0.7038, + "step": 16740 + }, + { + "epoch": 0.24628375277640105, + "grad_norm": 1.0376949310302734, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.6998, + "step": 16771 + }, + { + "epoch": 0.2467389907667456, + "grad_norm": 1.0389093160629272, + "learning_rate": 2.288805948824212e-05, + "loss": 0.7043, + "step": 16802 + }, + { + "epoch": 0.24719422875709016, + "grad_norm": 1.0645474195480347, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.6947, + "step": 16833 + }, + { + "epoch": 0.2476494667474347, + "grad_norm": 1.0893995761871338, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.6981, + "step": 16864 + }, + { + "epoch": 0.24810470473777924, + "grad_norm": 1.022275447845459, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.7081, + "step": 16895 + }, + { + "epoch": 0.2485599427281238, + "grad_norm": 1.1055867671966553, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.6942, + "step": 16926 + }, + { + "epoch": 0.24901518071846834, + "grad_norm": 1.0815192461013794, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.6951, + "step": 16957 + }, + { + "epoch": 0.2494704187088129, + "grad_norm": 1.0612388849258423, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.6991, + "step": 16988 + }, + { + "epoch": 0.24992565669915745, + "grad_norm": 1.0434961318969727, + "learning_rate": 2.230292185905114e-05, + "loss": 0.6904, + "step": 17019 + }, + { + "epoch": 0.250380894689502, + "grad_norm": 1.0427175760269165, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.6979, + "step": 17050 + }, + { + "epoch": 0.25083613267984656, + "grad_norm": 1.0715687274932861, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.7034, + "step": 17081 + }, + { + "epoch": 0.2512913706701911, + "grad_norm": 1.0116679668426514, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.6996, + "step": 17112 + }, + { + "epoch": 0.25174660866053566, + "grad_norm": 1.2103781700134277, + "learning_rate": 2.196920634473666e-05, + "loss": 0.7026, + "step": 17143 + }, + { + "epoch": 0.2522018466508802, + "grad_norm": 1.0434819459915161, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.6979, + "step": 17174 + }, + { + "epoch": 0.2526570846412247, + "grad_norm": 1.2911967039108276, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.6866, + "step": 17205 + }, + { + "epoch": 0.2531123226315693, + "grad_norm": 1.1720303297042847, + "learning_rate": 2.171927553439363e-05, + "loss": 0.6868, + "step": 17236 + }, + { + "epoch": 0.2535675606219138, + "grad_norm": 1.0302678346633911, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.691, + "step": 17267 + }, + { + "epoch": 0.2540227986122584, + "grad_norm": 1.0190601348876953, + "learning_rate": 2.155283853988844e-05, + "loss": 0.6964, + "step": 17298 + }, + { + "epoch": 0.25447803660260293, + "grad_norm": 1.109703540802002, + "learning_rate": 2.146967792431106e-05, + "loss": 0.693, + "step": 17329 + }, + { + "epoch": 0.25493327459294746, + "grad_norm": 1.160040020942688, + "learning_rate": 2.138655714793849e-05, + "loss": 0.6943, + "step": 17360 + }, + { + "epoch": 0.25538851258329204, + "grad_norm": 1.083268404006958, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.7024, + "step": 17391 + }, + { + "epoch": 0.25584375057363656, + "grad_norm": 1.0631040334701538, + "learning_rate": 2.122043886437421e-05, + "loss": 0.6896, + "step": 17422 + }, + { + "epoch": 0.25629898856398114, + "grad_norm": 1.2141170501708984, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.7005, + "step": 17453 + }, + { + "epoch": 0.25675422655432567, + "grad_norm": 1.082511067390442, + "learning_rate": 2.105449118766347e-05, + "loss": 0.6906, + "step": 17484 + }, + { + "epoch": 0.25720946454467025, + "grad_norm": 0.9919353127479553, + "learning_rate": 2.097158366805287e-05, + "loss": 0.7017, + "step": 17515 + }, + { + "epoch": 0.2576647025350148, + "grad_norm": 1.0450084209442139, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.696, + "step": 17546 + }, + { + "epoch": 0.2581199405253593, + "grad_norm": 1.0460536479949951, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.6947, + "step": 17577 + }, + { + "epoch": 0.2585751785157039, + "grad_norm": 1.0822510719299316, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.7039, + "step": 17608 + }, + { + "epoch": 0.2590304165060484, + "grad_norm": 1.0411216020584106, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.6962, + "step": 17639 + }, + { + "epoch": 0.259485654496393, + "grad_norm": 1.0115315914154053, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.6954, + "step": 17670 + }, + { + "epoch": 0.2599408924867375, + "grad_norm": 1.0552514791488647, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.6881, + "step": 17701 + }, + { + "epoch": 0.26039613047708204, + "grad_norm": 0.9966985583305359, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.7012, + "step": 17732 + }, + { + "epoch": 0.2608513684674266, + "grad_norm": 1.113692045211792, + "learning_rate": 2.031003855589343e-05, + "loss": 0.703, + "step": 17763 + }, + { + "epoch": 0.26130660645777115, + "grad_norm": 1.0169728994369507, + "learning_rate": 2.022757379528727e-05, + "loss": 0.7008, + "step": 17794 + }, + { + "epoch": 0.26176184444811573, + "grad_norm": 1.1313414573669434, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.6956, + "step": 17825 + }, + { + "epoch": 0.26221708243846026, + "grad_norm": 0.9456464052200317, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.694, + "step": 17856 + }, + { + "epoch": 0.26267232042880484, + "grad_norm": 1.0825542211532593, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.6915, + "step": 17887 + }, + { + "epoch": 0.26312755841914937, + "grad_norm": 1.059581995010376, + "learning_rate": 1.989826261153015e-05, + "loss": 0.6922, + "step": 17918 + }, + { + "epoch": 0.2635827964094939, + "grad_norm": 1.0134432315826416, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.6952, + "step": 17949 + }, + { + "epoch": 0.2640380343998385, + "grad_norm": 0.9800439476966858, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.7036, + "step": 17980 + }, + { + "epoch": 0.264493272390183, + "grad_norm": 1.128818392753601, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.6916, + "step": 18011 + }, + { + "epoch": 0.2649485103805276, + "grad_norm": 1.0002161264419556, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.6886, + "step": 18042 + }, + { + "epoch": 0.2654037483708721, + "grad_norm": 1.1037601232528687, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.6954, + "step": 18073 + }, + { + "epoch": 0.2658589863612167, + "grad_norm": 1.0204657316207886, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.6976, + "step": 18104 + }, + { + "epoch": 0.2663142243515612, + "grad_norm": 1.0254517793655396, + "learning_rate": 1.932422022132275e-05, + "loss": 0.697, + "step": 18135 + }, + { + "epoch": 0.26676946234190574, + "grad_norm": 1.0792242288589478, + "learning_rate": 1.924246297341414e-05, + "loss": 0.6932, + "step": 18166 + }, + { + "epoch": 0.2672247003322503, + "grad_norm": 1.2440094947814941, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.6925, + "step": 18197 + }, + { + "epoch": 0.26767993832259485, + "grad_norm": 1.0181853771209717, + "learning_rate": 1.907914431855156e-05, + "loss": 0.6854, + "step": 18228 + }, + { + "epoch": 0.2681351763129394, + "grad_norm": 0.982681930065155, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.6892, + "step": 18259 + }, + { + "epoch": 0.26859041430328395, + "grad_norm": 1.1587820053100586, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.6955, + "step": 18290 + }, + { + "epoch": 0.2690456522936285, + "grad_norm": 1.0297470092773438, + "learning_rate": 1.883466975572098e-05, + "loss": 0.6921, + "step": 18321 + }, + { + "epoch": 0.26950089028397306, + "grad_norm": 1.0646672248840332, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.6966, + "step": 18352 + }, + { + "epoch": 0.2699561282743176, + "grad_norm": 1.0070273876190186, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.7005, + "step": 18383 + }, + { + "epoch": 0.27041136626466217, + "grad_norm": 0.9793278574943542, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.6894, + "step": 18414 + }, + { + "epoch": 0.2708666042550067, + "grad_norm": 1.0349115133285522, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.6906, + "step": 18445 + }, + { + "epoch": 0.2713218422453513, + "grad_norm": 1.0271046161651611, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.6916, + "step": 18476 + }, + { + "epoch": 0.2717770802356958, + "grad_norm": 0.9766640663146973, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.692, + "step": 18507 + }, + { + "epoch": 0.2722323182260403, + "grad_norm": 1.0498918294906616, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.687, + "step": 18538 + }, + { + "epoch": 0.2726875562163849, + "grad_norm": 0.970116138458252, + "learning_rate": 1.818586609711774e-05, + "loss": 0.6923, + "step": 18569 + }, + { + "epoch": 0.27314279420672943, + "grad_norm": 1.1822494268417358, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.6899, + "step": 18600 + }, + { + "epoch": 0.273598032197074, + "grad_norm": 1.0538249015808105, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.6912, + "step": 18631 + }, + { + "epoch": 0.27405327018741854, + "grad_norm": 1.123678207397461, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.6918, + "step": 18662 + }, + { + "epoch": 0.27450850817776307, + "grad_norm": 1.0302077531814575, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.6872, + "step": 18693 + }, + { + "epoch": 0.27496374616810765, + "grad_norm": 1.0867012739181519, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.7006, + "step": 18724 + }, + { + "epoch": 0.2754189841584522, + "grad_norm": 1.0516695976257324, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.6969, + "step": 18755 + }, + { + "epoch": 0.27587422214879675, + "grad_norm": 1.083567500114441, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.6936, + "step": 18786 + }, + { + "epoch": 0.2763294601391413, + "grad_norm": 1.0399643182754517, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.6887, + "step": 18817 + }, + { + "epoch": 0.27678469812948586, + "grad_norm": 1.1514192819595337, + "learning_rate": 1.746186742108387e-05, + "loss": 0.6882, + "step": 18848 + }, + { + "epoch": 0.2772399361198304, + "grad_norm": 1.1234108209609985, + "learning_rate": 1.73818363812215e-05, + "loss": 0.6909, + "step": 18879 + }, + { + "epoch": 0.2776951741101749, + "grad_norm": 1.0432260036468506, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.6826, + "step": 18910 + }, + { + "epoch": 0.2781504121005195, + "grad_norm": 1.2708081007003784, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.694, + "step": 18941 + }, + { + "epoch": 0.278605650090864, + "grad_norm": 0.9991064667701721, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.7001, + "step": 18972 + }, + { + "epoch": 0.2790608880812086, + "grad_norm": 1.103553295135498, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.6974, + "step": 19003 + }, + { + "epoch": 0.27951612607155313, + "grad_norm": 1.0002790689468384, + "learning_rate": 1.698298875964369e-05, + "loss": 0.6951, + "step": 19034 + }, + { + "epoch": 0.27997136406189765, + "grad_norm": 1.0627328157424927, + "learning_rate": 1.690348705220684e-05, + "loss": 0.6914, + "step": 19065 + }, + { + "epoch": 0.28042660205224224, + "grad_norm": 1.152733325958252, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.6909, + "step": 19096 + }, + { + "epoch": 0.28088184004258676, + "grad_norm": 1.1142559051513672, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.6878, + "step": 19127 + }, + { + "epoch": 0.28133707803293134, + "grad_norm": 1.022026538848877, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.6876, + "step": 19158 + }, + { + "epoch": 0.28179231602327587, + "grad_norm": 1.117065668106079, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.6878, + "step": 19189 + }, + { + "epoch": 0.28224755401362045, + "grad_norm": 0.9499729871749878, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.6888, + "step": 19220 + }, + { + "epoch": 0.282702792003965, + "grad_norm": 1.111111044883728, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.6898, + "step": 19251 + }, + { + "epoch": 0.2831580299943095, + "grad_norm": 1.1620928049087524, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.6948, + "step": 19282 + }, + { + "epoch": 0.2836132679846541, + "grad_norm": 1.1431219577789307, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.6929, + "step": 19313 + }, + { + "epoch": 0.2840685059749986, + "grad_norm": 1.1274683475494385, + "learning_rate": 1.619219056243676e-05, + "loss": 0.6823, + "step": 19344 + }, + { + "epoch": 0.2845237439653432, + "grad_norm": 1.1499154567718506, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.6838, + "step": 19375 + }, + { + "epoch": 0.2849789819556877, + "grad_norm": 1.0493180751800537, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.6867, + "step": 19406 + }, + { + "epoch": 0.2854342199460323, + "grad_norm": 0.9728123545646667, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.6889, + "step": 19437 + }, + { + "epoch": 0.2858894579363768, + "grad_norm": 1.0137308835983276, + "learning_rate": 1.587860447859413e-05, + "loss": 0.6892, + "step": 19468 + }, + { + "epoch": 0.28634469592672135, + "grad_norm": 1.0865050554275513, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.6841, + "step": 19499 + }, + { + "epoch": 0.28679993391706593, + "grad_norm": 1.0522550344467163, + "learning_rate": 1.572242550298298e-05, + "loss": 0.6905, + "step": 19530 + }, + { + "epoch": 0.28725517190741046, + "grad_norm": 1.1563197374343872, + "learning_rate": 1.56444926191065e-05, + "loss": 0.6811, + "step": 19561 + }, + { + "epoch": 0.28771040989775504, + "grad_norm": 0.962688684463501, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.6898, + "step": 19592 + }, + { + "epoch": 0.28816564788809956, + "grad_norm": 1.0998531579971313, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.6909, + "step": 19623 + }, + { + "epoch": 0.2886208858784441, + "grad_norm": 1.1609821319580078, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.6844, + "step": 19654 + }, + { + "epoch": 0.28907612386878867, + "grad_norm": 0.9745819568634033, + "learning_rate": 1.533382561658241e-05, + "loss": 0.6933, + "step": 19685 + }, + { + "epoch": 0.2895313618591332, + "grad_norm": 1.085925817489624, + "learning_rate": 1.525642938149541e-05, + "loss": 0.6894, + "step": 19716 + }, + { + "epoch": 0.2899865998494778, + "grad_norm": 1.0314606428146362, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.6965, + "step": 19747 + }, + { + "epoch": 0.2904418378398223, + "grad_norm": 1.0771900415420532, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.6904, + "step": 19778 + }, + { + "epoch": 0.2908970758301669, + "grad_norm": 0.9729062914848328, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.6886, + "step": 19809 + }, + { + "epoch": 0.2913523138205114, + "grad_norm": 1.0824676752090454, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.6804, + "step": 19840 + }, + { + "epoch": 0.29180755181085594, + "grad_norm": 1.0260144472122192, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.6905, + "step": 19871 + }, + { + "epoch": 0.2922627898012005, + "grad_norm": 0.9324101209640503, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.6877, + "step": 19902 + }, + { + "epoch": 0.29271802779154504, + "grad_norm": 1.0553687810897827, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.6928, + "step": 19933 + }, + { + "epoch": 0.2931732657818896, + "grad_norm": 1.129400610923767, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.6905, + "step": 19964 + }, + { + "epoch": 0.29362850377223415, + "grad_norm": 1.064041018486023, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.6936, + "step": 19995 + }, + { + "epoch": 0.2940837417625787, + "grad_norm": 1.116929292678833, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.6818, + "step": 20026 + }, + { + "epoch": 0.29453897975292326, + "grad_norm": 1.0334928035736084, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.6887, + "step": 20057 + }, + { + "epoch": 0.2949942177432678, + "grad_norm": 1.0690734386444092, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.6885, + "step": 20088 + }, + { + "epoch": 0.29544945573361237, + "grad_norm": 1.1211203336715698, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.6919, + "step": 20119 + }, + { + "epoch": 0.2959046937239569, + "grad_norm": 0.9984875917434692, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.6892, + "step": 20150 + }, + { + "epoch": 0.29635993171430147, + "grad_norm": 1.0159475803375244, + "learning_rate": 1.410916653306954e-05, + "loss": 0.682, + "step": 20181 + }, + { + "epoch": 0.296815169704646, + "grad_norm": 0.9778633117675781, + "learning_rate": 1.403363351752639e-05, + "loss": 0.6808, + "step": 20212 + }, + { + "epoch": 0.2972704076949905, + "grad_norm": 1.1207058429718018, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.6852, + "step": 20243 + }, + { + "epoch": 0.2977256456853351, + "grad_norm": 1.0286227464675903, + "learning_rate": 1.388293959928911e-05, + "loss": 0.6886, + "step": 20274 + }, + { + "epoch": 0.29818088367567963, + "grad_norm": 1.0112954378128052, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.6878, + "step": 20305 + }, + { + "epoch": 0.2986361216660242, + "grad_norm": 1.0683724880218506, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.6889, + "step": 20336 + }, + { + "epoch": 0.29909135965636874, + "grad_norm": 1.0744072198867798, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.6791, + "step": 20367 + }, + { + "epoch": 0.2995465976467133, + "grad_norm": 1.0279752016067505, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.684, + "step": 20398 + }, + { + "epoch": 0.30000183563705785, + "grad_norm": 0.9995334148406982, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.6906, + "step": 20429 + }, + { + "epoch": 0.30045707362740237, + "grad_norm": 1.351607322692871, + "learning_rate": 1.343389583978327e-05, + "loss": 0.6964, + "step": 20460 + }, + { + "epoch": 0.30091231161774695, + "grad_norm": 1.0838359594345093, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.6784, + "step": 20491 + }, + { + "epoch": 0.3013675496080915, + "grad_norm": 1.0536307096481323, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.6872, + "step": 20522 + }, + { + "epoch": 0.30182278759843606, + "grad_norm": 0.9636529088020325, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.6914, + "step": 20553 + }, + { + "epoch": 0.3022780255887806, + "grad_norm": 1.1852017641067505, + "learning_rate": 1.313713250302451e-05, + "loss": 0.6821, + "step": 20584 + }, + { + "epoch": 0.3027332635791251, + "grad_norm": 1.072434425354004, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.695, + "step": 20615 + }, + { + "epoch": 0.3031885015694697, + "grad_norm": 1.2345269918441772, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.6824, + "step": 20646 + }, + { + "epoch": 0.3036437395598142, + "grad_norm": 1.0516636371612549, + "learning_rate": 1.291596270869846e-05, + "loss": 0.6854, + "step": 20677 + }, + { + "epoch": 0.3040989775501588, + "grad_norm": 1.0413544178009033, + "learning_rate": 1.284251106960927e-05, + "loss": 0.6895, + "step": 20708 + }, + { + "epoch": 0.3045542155405033, + "grad_norm": 1.158065676689148, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.6861, + "step": 20739 + }, + { + "epoch": 0.3050094535308479, + "grad_norm": 1.0109269618988037, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.6898, + "step": 20770 + }, + { + "epoch": 0.30546469152119243, + "grad_norm": 0.9886858463287354, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.6874, + "step": 20801 + }, + { + "epoch": 0.30591992951153696, + "grad_norm": 1.0234347581863403, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.6823, + "step": 20832 + }, + { + "epoch": 0.30637516750188154, + "grad_norm": 1.028950810432434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.6884, + "step": 20863 + }, + { + "epoch": 0.30683040549222607, + "grad_norm": 1.1941654682159424, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.7019, + "step": 20894 + }, + { + "epoch": 0.30728564348257065, + "grad_norm": 1.0201176404953003, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.6846, + "step": 20925 + }, + { + "epoch": 0.3077408814729152, + "grad_norm": 0.9765841364860535, + "learning_rate": 1.225990629829241e-05, + "loss": 0.6881, + "step": 20956 + }, + { + "epoch": 0.3081961194632597, + "grad_norm": 1.0036793947219849, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.6849, + "step": 20987 + }, + { + "epoch": 0.3086513574536043, + "grad_norm": 1.1151163578033447, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.6825, + "step": 21018 + }, + { + "epoch": 0.3091065954439488, + "grad_norm": 1.0734307765960693, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.6902, + "step": 21049 + }, + { + "epoch": 0.3095618334342934, + "grad_norm": 0.9811964631080627, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.6883, + "step": 21080 + }, + { + "epoch": 0.3100170714246379, + "grad_norm": 1.0949833393096924, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.6873, + "step": 21111 + }, + { + "epoch": 0.3104723094149825, + "grad_norm": 1.0459587574005127, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.6853, + "step": 21142 + }, + { + "epoch": 0.310927547405327, + "grad_norm": 1.1628592014312744, + "learning_rate": 1.175766039353062e-05, + "loss": 0.6837, + "step": 21173 + }, + { + "epoch": 0.31138278539567155, + "grad_norm": 0.9916526079177856, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.6838, + "step": 21204 + }, + { + "epoch": 0.3118380233860161, + "grad_norm": 0.9945309162139893, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.6811, + "step": 21235 + }, + { + "epoch": 0.31229326137636065, + "grad_norm": 1.0234261751174927, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.6833, + "step": 21266 + }, + { + "epoch": 0.31274849936670523, + "grad_norm": 0.999071478843689, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.6858, + "step": 21297 + }, + { + "epoch": 0.31320373735704976, + "grad_norm": 1.0478752851486206, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.6918, + "step": 21328 + }, + { + "epoch": 0.3136589753473943, + "grad_norm": 1.083009958267212, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.6758, + "step": 21359 + }, + { + "epoch": 0.31411421333773887, + "grad_norm": 0.9705089926719666, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.6784, + "step": 21390 + }, + { + "epoch": 0.3145694513280834, + "grad_norm": 0.9727345108985901, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.6902, + "step": 21421 + }, + { + "epoch": 0.315024689318428, + "grad_norm": 1.1719439029693604, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.6802, + "step": 21452 + }, + { + "epoch": 0.3154799273087725, + "grad_norm": 1.061924695968628, + "learning_rate": 1.105293586433634e-05, + "loss": 0.6829, + "step": 21483 + }, + { + "epoch": 0.3159351652991171, + "grad_norm": 0.965242326259613, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.6795, + "step": 21514 + }, + { + "epoch": 0.3163904032894616, + "grad_norm": 1.0916402339935303, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.689, + "step": 21545 + }, + { + "epoch": 0.31684564127980613, + "grad_norm": 1.088815450668335, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.6845, + "step": 21576 + }, + { + "epoch": 0.3173008792701507, + "grad_norm": 1.052106499671936, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.68, + "step": 21607 + }, + { + "epoch": 0.31775611726049524, + "grad_norm": 0.9820737242698669, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.6872, + "step": 21638 + }, + { + "epoch": 0.3182113552508398, + "grad_norm": 1.014233946800232, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.6891, + "step": 21669 + }, + { + "epoch": 0.31866659324118435, + "grad_norm": 1.098426103591919, + "learning_rate": 1.05689459597817e-05, + "loss": 0.6878, + "step": 21700 + }, + { + "epoch": 0.31912183123152893, + "grad_norm": 1.0153820514678955, + "learning_rate": 1.050044973809246e-05, + "loss": 0.6793, + "step": 21731 + }, + { + "epoch": 0.31957706922187346, + "grad_norm": 1.07473886013031, + "learning_rate": 1.043211714185722e-05, + "loss": 0.6892, + "step": 21762 + }, + { + "epoch": 0.320032307212218, + "grad_norm": 1.108799934387207, + "learning_rate": 1.036394894220003e-05, + "loss": 0.6819, + "step": 21793 + }, + { + "epoch": 0.32048754520256256, + "grad_norm": 1.105481743812561, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.6871, + "step": 21824 + }, + { + "epoch": 0.3209427831929071, + "grad_norm": 1.106384038925171, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.6789, + "step": 21855 + }, + { + "epoch": 0.32139802118325167, + "grad_norm": 1.0353236198425293, + "learning_rate": 1.01604384060574e-05, + "loss": 0.6822, + "step": 21886 + }, + { + "epoch": 0.3218532591735962, + "grad_norm": 1.086665153503418, + "learning_rate": 1.009293546671907e-05, + "loss": 0.6804, + "step": 21917 + }, + { + "epoch": 0.3223084971639407, + "grad_norm": 0.982736349105835, + "learning_rate": 1.002560075157791e-05, + "loss": 0.6838, + "step": 21948 + }, + { + "epoch": 0.3227637351542853, + "grad_norm": 1.1834845542907715, + "learning_rate": 9.958435020496995e-06, + "loss": 0.6881, + "step": 21979 + }, + { + "epoch": 0.32321897314462983, + "grad_norm": 1.0267136096954346, + "learning_rate": 9.89143903143249e-06, + "loss": 0.6814, + "step": 22010 + }, + { + "epoch": 0.3236742111349744, + "grad_norm": 1.0347238779067993, + "learning_rate": 9.824613540425038e-06, + "loss": 0.6806, + "step": 22041 + }, + { + "epoch": 0.32412944912531894, + "grad_norm": 1.0527032613754272, + "learning_rate": 9.757959301591197e-06, + "loss": 0.6791, + "step": 22072 + }, + { + "epoch": 0.3245846871156635, + "grad_norm": 1.0493974685668945, + "learning_rate": 9.691477067115017e-06, + "loss": 0.69, + "step": 22103 + }, + { + "epoch": 0.32503992510600804, + "grad_norm": 1.0073857307434082, + "learning_rate": 9.625167587239467e-06, + "loss": 0.6781, + "step": 22134 + }, + { + "epoch": 0.32549516309635257, + "grad_norm": 0.9913318157196045, + "learning_rate": 9.559031610258007e-06, + "loss": 0.6825, + "step": 22165 + }, + { + "epoch": 0.32595040108669715, + "grad_norm": 1.0183086395263672, + "learning_rate": 9.493069882506164e-06, + "loss": 0.6805, + "step": 22196 + }, + { + "epoch": 0.3264056390770417, + "grad_norm": 1.0087958574295044, + "learning_rate": 9.427283148353056e-06, + "loss": 0.6775, + "step": 22227 + }, + { + "epoch": 0.32686087706738626, + "grad_norm": 1.1444545984268188, + "learning_rate": 9.361672150193052e-06, + "loss": 0.684, + "step": 22258 + }, + { + "epoch": 0.3273161150577308, + "grad_norm": 1.0593520402908325, + "learning_rate": 9.29623762843734e-06, + "loss": 0.6874, + "step": 22289 + }, + { + "epoch": 0.3277713530480753, + "grad_norm": 1.0576874017715454, + "learning_rate": 9.230980321505594e-06, + "loss": 0.6789, + "step": 22320 + }, + { + "epoch": 0.3282265910384199, + "grad_norm": 0.9419311285018921, + "learning_rate": 9.165900965817668e-06, + "loss": 0.6727, + "step": 22351 + }, + { + "epoch": 0.3286818290287644, + "grad_norm": 1.116232991218567, + "learning_rate": 9.101000295785245e-06, + "loss": 0.6868, + "step": 22382 + }, + { + "epoch": 0.329137067019109, + "grad_norm": 1.1253653764724731, + "learning_rate": 9.036279043803565e-06, + "loss": 0.6782, + "step": 22413 + }, + { + "epoch": 0.3295923050094535, + "grad_norm": 1.1574054956436157, + "learning_rate": 8.971737940243147e-06, + "loss": 0.6801, + "step": 22444 + }, + { + "epoch": 0.3300475429997981, + "grad_norm": 0.9954296946525574, + "learning_rate": 8.907377713441592e-06, + "loss": 0.6815, + "step": 22475 + }, + { + "epoch": 0.33050278099014263, + "grad_norm": 1.0231496095657349, + "learning_rate": 8.843199089695293e-06, + "loss": 0.6828, + "step": 22506 + }, + { + "epoch": 0.33095801898048716, + "grad_norm": 1.0164254903793335, + "learning_rate": 8.779202793251311e-06, + "loss": 0.6806, + "step": 22537 + }, + { + "epoch": 0.33141325697083174, + "grad_norm": 0.9951460361480713, + "learning_rate": 8.715389546299149e-06, + "loss": 0.6838, + "step": 22568 + }, + { + "epoch": 0.33186849496117626, + "grad_norm": 1.117965817451477, + "learning_rate": 8.651760068962617e-06, + "loss": 0.6726, + "step": 22599 + }, + { + "epoch": 0.33232373295152084, + "grad_norm": 0.9313582181930542, + "learning_rate": 8.588315079291733e-06, + "loss": 0.6807, + "step": 22630 + }, + { + "epoch": 0.33277897094186537, + "grad_norm": 1.0607514381408691, + "learning_rate": 8.52505529325457e-06, + "loss": 0.6757, + "step": 22661 + }, + { + "epoch": 0.3332342089322099, + "grad_norm": 0.9300371408462524, + "learning_rate": 8.461981424729216e-06, + "loss": 0.6715, + "step": 22692 + }, + { + "epoch": 0.3336894469225545, + "grad_norm": 1.0345282554626465, + "learning_rate": 8.399094185495725e-06, + "loss": 0.6814, + "step": 22723 + }, + { + "epoch": 0.334144684912899, + "grad_norm": 1.008501648902893, + "learning_rate": 8.336394285228017e-06, + "loss": 0.6874, + "step": 22754 + }, + { + "epoch": 0.3345999229032436, + "grad_norm": 1.2125533819198608, + "learning_rate": 8.273882431485952e-06, + "loss": 0.6822, + "step": 22785 + }, + { + "epoch": 0.3350551608935881, + "grad_norm": 1.0801039934158325, + "learning_rate": 8.211559329707316e-06, + "loss": 0.6934, + "step": 22816 + }, + { + "epoch": 0.3355103988839327, + "grad_norm": 1.054120659828186, + "learning_rate": 8.149425683199823e-06, + "loss": 0.673, + "step": 22847 + }, + { + "epoch": 0.3359656368742772, + "grad_norm": 1.0891995429992676, + "learning_rate": 8.08748219313325e-06, + "loss": 0.6852, + "step": 22878 + }, + { + "epoch": 0.33642087486462174, + "grad_norm": 1.0153359174728394, + "learning_rate": 8.025729558531453e-06, + "loss": 0.6838, + "step": 22909 + }, + { + "epoch": 0.3368761128549663, + "grad_norm": 1.0803651809692383, + "learning_rate": 7.964168476264508e-06, + "loss": 0.6832, + "step": 22940 + }, + { + "epoch": 0.33733135084531085, + "grad_norm": 1.0524797439575195, + "learning_rate": 7.902799641040884e-06, + "loss": 0.6818, + "step": 22971 + }, + { + "epoch": 0.33778658883565543, + "grad_norm": 1.1119606494903564, + "learning_rate": 7.841623745399523e-06, + "loss": 0.6853, + "step": 23002 + }, + { + "epoch": 0.33824182682599996, + "grad_norm": 0.9859051704406738, + "learning_rate": 7.780641479702114e-06, + "loss": 0.6865, + "step": 23033 + }, + { + "epoch": 0.33869706481634454, + "grad_norm": 0.9103122353553772, + "learning_rate": 7.719853532125227e-06, + "loss": 0.6766, + "step": 23064 + }, + { + "epoch": 0.33915230280668907, + "grad_norm": 1.023934245109558, + "learning_rate": 7.65926058865258e-06, + "loss": 0.6781, + "step": 23095 + }, + { + "epoch": 0.3396075407970336, + "grad_norm": 1.0698734521865845, + "learning_rate": 7.598863333067313e-06, + "loss": 0.6848, + "step": 23126 + }, + { + "epoch": 0.3400627787873782, + "grad_norm": 1.1564706563949585, + "learning_rate": 7.538662446944253e-06, + "loss": 0.6868, + "step": 23157 + }, + { + "epoch": 0.3405180167777227, + "grad_norm": 1.114890456199646, + "learning_rate": 7.478658609642211e-06, + "loss": 0.6785, + "step": 23188 + }, + { + "epoch": 0.3409732547680673, + "grad_norm": 1.1149795055389404, + "learning_rate": 7.418852498296327e-06, + "loss": 0.6706, + "step": 23219 + }, + { + "epoch": 0.3414284927584118, + "grad_norm": 1.0689568519592285, + "learning_rate": 7.359244787810457e-06, + "loss": 0.6751, + "step": 23250 + }, + { + "epoch": 0.34188373074875633, + "grad_norm": 1.1450313329696655, + "learning_rate": 7.299836150849493e-06, + "loss": 0.6708, + "step": 23281 + }, + { + "epoch": 0.3423389687391009, + "grad_norm": 1.1399791240692139, + "learning_rate": 7.240627257831847e-06, + "loss": 0.6781, + "step": 23312 + }, + { + "epoch": 0.34279420672944544, + "grad_norm": 1.131117343902588, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.6767, + "step": 23343 + }, + { + "epoch": 0.34324944471979, + "grad_norm": 1.0069117546081543, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.6795, + "step": 23374 + }, + { + "epoch": 0.34370468271013455, + "grad_norm": 1.0576585531234741, + "learning_rate": 7.064205712766226e-06, + "loss": 0.685, + "step": 23405 + }, + { + "epoch": 0.3441599207004791, + "grad_norm": 1.1254713535308838, + "learning_rate": 7.005802454511129e-06, + "loss": 0.6772, + "step": 23436 + }, + { + "epoch": 0.34461515869082365, + "grad_norm": 1.032549262046814, + "learning_rate": 6.947602258329639e-06, + "loss": 0.6777, + "step": 23467 + }, + { + "epoch": 0.3450703966811682, + "grad_norm": 1.019902229309082, + "learning_rate": 6.889605781003078e-06, + "loss": 0.6814, + "step": 23498 + }, + { + "epoch": 0.34552563467151276, + "grad_norm": 1.0798234939575195, + "learning_rate": 6.831813677013776e-06, + "loss": 0.671, + "step": 23529 + }, + { + "epoch": 0.3459808726618573, + "grad_norm": 1.0329113006591797, + "learning_rate": 6.774226598537792e-06, + "loss": 0.6767, + "step": 23560 + }, + { + "epoch": 0.34643611065220187, + "grad_norm": 1.0345433950424194, + "learning_rate": 6.716845195437482e-06, + "loss": 0.6739, + "step": 23591 + }, + { + "epoch": 0.3468913486425464, + "grad_norm": 0.9708922505378723, + "learning_rate": 6.659670115254168e-06, + "loss": 0.6793, + "step": 23622 + }, + { + "epoch": 0.3473465866328909, + "grad_norm": 1.035715103149414, + "learning_rate": 6.602702003200872e-06, + "loss": 0.6847, + "step": 23653 + }, + { + "epoch": 0.3478018246232355, + "grad_norm": 0.9764015078544617, + "learning_rate": 6.545941502154992e-06, + "loss": 0.6815, + "step": 23684 + }, + { + "epoch": 0.34825706261358, + "grad_norm": 1.033187747001648, + "learning_rate": 6.489389252651057e-06, + "loss": 0.6715, + "step": 23715 + }, + { + "epoch": 0.3487123006039246, + "grad_norm": 1.0181931257247925, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.6758, + "step": 23746 + }, + { + "epoch": 0.34916753859426913, + "grad_norm": 1.0105568170547485, + "learning_rate": 6.376912058649559e-06, + "loss": 0.6746, + "step": 23777 + }, + { + "epoch": 0.3496227765846137, + "grad_norm": 1.0559037923812866, + "learning_rate": 6.320988383441845e-06, + "loss": 0.6795, + "step": 23808 + }, + { + "epoch": 0.35007801457495824, + "grad_norm": 1.0579489469528198, + "learning_rate": 6.265275498341452e-06, + "loss": 0.6812, + "step": 23839 + }, + { + "epoch": 0.35053325256530277, + "grad_norm": 1.0675939321517944, + "learning_rate": 6.209774032060714e-06, + "loss": 0.6769, + "step": 23870 + }, + { + "epoch": 0.35098849055564735, + "grad_norm": 0.8914999961853027, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.6825, + "step": 23901 + }, + { + "epoch": 0.3514437285459919, + "grad_norm": 0.9489038586616516, + "learning_rate": 6.099407858871342e-06, + "loss": 0.6801, + "step": 23932 + }, + { + "epoch": 0.35189896653633645, + "grad_norm": 1.040340781211853, + "learning_rate": 6.044544397429958e-06, + "loss": 0.6813, + "step": 23963 + }, + { + "epoch": 0.352354204526681, + "grad_norm": 1.2235099077224731, + "learning_rate": 5.989894845728708e-06, + "loss": 0.6742, + "step": 23994 + }, + { + "epoch": 0.3528094425170255, + "grad_norm": 1.0165103673934937, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.6768, + "step": 24025 + }, + { + "epoch": 0.3532646805073701, + "grad_norm": 1.023253321647644, + "learning_rate": 5.881239935976762e-06, + "loss": 0.678, + "step": 24056 + }, + { + "epoch": 0.3537199184977146, + "grad_norm": 1.0616728067398071, + "learning_rate": 5.827235804081954e-06, + "loss": 0.6723, + "step": 24087 + }, + { + "epoch": 0.3541751564880592, + "grad_norm": 1.021304726600647, + "learning_rate": 5.773448034225221e-06, + "loss": 0.677, + "step": 24118 + }, + { + "epoch": 0.3546303944784037, + "grad_norm": 1.055176854133606, + "learning_rate": 5.719877233394228e-06, + "loss": 0.6779, + "step": 24149 + }, + { + "epoch": 0.3550856324687483, + "grad_norm": 1.0145021677017212, + "learning_rate": 5.666524006128191e-06, + "loss": 0.679, + "step": 24180 + }, + { + "epoch": 0.35554087045909283, + "grad_norm": 1.0869520902633667, + "learning_rate": 5.613388954511015e-06, + "loss": 0.6728, + "step": 24211 + }, + { + "epoch": 0.35599610844943735, + "grad_norm": 0.9280300140380859, + "learning_rate": 5.560472678164552e-06, + "loss": 0.6794, + "step": 24242 + }, + { + "epoch": 0.35645134643978194, + "grad_norm": 0.9949683547019958, + "learning_rate": 5.507775774241775e-06, + "loss": 0.6771, + "step": 24273 + }, + { + "epoch": 0.35690658443012646, + "grad_norm": 1.0003410577774048, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.6789, + "step": 24304 + }, + { + "epoch": 0.35736182242047104, + "grad_norm": 1.0053478479385376, + "learning_rate": 5.403042459894597e-06, + "loss": 0.6783, + "step": 24335 + }, + { + "epoch": 0.35781706041081557, + "grad_norm": 0.992439866065979, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.6734, + "step": 24366 + }, + { + "epoch": 0.35827229840116015, + "grad_norm": 1.0443801879882812, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.6833, + "step": 24397 + }, + { + "epoch": 0.3587275363915047, + "grad_norm": 0.9766219854354858, + "learning_rate": 5.247602567671625e-06, + "loss": 0.6731, + "step": 24428 + }, + { + "epoch": 0.3591827743818492, + "grad_norm": 1.0202499628067017, + "learning_rate": 5.196234299402603e-06, + "loss": 0.6801, + "step": 24459 + }, + { + "epoch": 0.3596380123721938, + "grad_norm": 1.0573033094406128, + "learning_rate": 5.145089513937865e-06, + "loss": 0.6759, + "step": 24490 + }, + { + "epoch": 0.3600932503625383, + "grad_norm": 1.0492045879364014, + "learning_rate": 5.094168788439369e-06, + "loss": 0.6805, + "step": 24521 + }, + { + "epoch": 0.3605484883528829, + "grad_norm": 0.9806486964225769, + "learning_rate": 5.043472697540594e-06, + "loss": 0.6809, + "step": 24552 + }, + { + "epoch": 0.3610037263432274, + "grad_norm": 1.0643500089645386, + "learning_rate": 4.993001813340012e-06, + "loss": 0.6749, + "step": 24583 + }, + { + "epoch": 0.36145896433357194, + "grad_norm": 1.0298391580581665, + "learning_rate": 4.942756705394702e-06, + "loss": 0.6692, + "step": 24614 + }, + { + "epoch": 0.3619142023239165, + "grad_norm": 1.0897893905639648, + "learning_rate": 4.892737940713884e-06, + "loss": 0.6843, + "step": 24645 + }, + { + "epoch": 0.36236944031426105, + "grad_norm": 1.016927719116211, + "learning_rate": 4.842946083752511e-06, + "loss": 0.6789, + "step": 24676 + }, + { + "epoch": 0.36282467830460563, + "grad_norm": 1.0338464975357056, + "learning_rate": 4.79338169640493e-06, + "loss": 0.6693, + "step": 24707 + }, + { + "epoch": 0.36327991629495016, + "grad_norm": 1.0217540264129639, + "learning_rate": 4.74404533799851e-06, + "loss": 0.6734, + "step": 24738 + }, + { + "epoch": 0.36373515428529474, + "grad_norm": 0.973475992679596, + "learning_rate": 4.694937565287344e-06, + "loss": 0.6716, + "step": 24769 + }, + { + "epoch": 0.36419039227563926, + "grad_norm": 1.0647684335708618, + "learning_rate": 4.646058932445985e-06, + "loss": 0.6731, + "step": 24800 + }, + { + "epoch": 0.3646456302659838, + "grad_norm": 1.0267359018325806, + "learning_rate": 4.597409991063148e-06, + "loss": 0.673, + "step": 24831 + }, + { + "epoch": 0.36510086825632837, + "grad_norm": 1.150754451751709, + "learning_rate": 4.5489912901355375e-06, + "loss": 0.6818, + "step": 24862 + }, + { + "epoch": 0.3655561062466729, + "grad_norm": 1.061081886291504, + "learning_rate": 4.500803376061608e-06, + "loss": 0.6804, + "step": 24893 + }, + { + "epoch": 0.3660113442370175, + "grad_norm": 1.08760404586792, + "learning_rate": 4.45284679263541e-06, + "loss": 0.667, + "step": 24924 + }, + { + "epoch": 0.366466582227362, + "grad_norm": 1.0349477529525757, + "learning_rate": 4.4051220810404775e-06, + "loss": 0.6712, + "step": 24955 + }, + { + "epoch": 0.36692182021770653, + "grad_norm": 1.015647053718567, + "learning_rate": 4.3576297798437025e-06, + "loss": 0.6736, + "step": 24986 + }, + { + "epoch": 0.3673770582080511, + "grad_norm": 1.0646811723709106, + "learning_rate": 4.3103704249892436e-06, + "loss": 0.6758, + "step": 25017 + }, + { + "epoch": 0.36783229619839564, + "grad_norm": 1.0361759662628174, + "learning_rate": 4.263344549792487e-06, + "loss": 0.6768, + "step": 25048 + }, + { + "epoch": 0.3682875341887402, + "grad_norm": 0.9110644459724426, + "learning_rate": 4.216552684934056e-06, + "loss": 0.6826, + "step": 25079 + }, + { + "epoch": 0.36874277217908474, + "grad_norm": 0.9504536390304565, + "learning_rate": 4.169995358453777e-06, + "loss": 0.6848, + "step": 25110 + }, + { + "epoch": 0.3691980101694293, + "grad_norm": 1.049663782119751, + "learning_rate": 4.123673095744757e-06, + "loss": 0.6786, + "step": 25141 + }, + { + "epoch": 0.36965324815977385, + "grad_norm": 1.0623620748519897, + "learning_rate": 4.077586419547435e-06, + "loss": 0.6834, + "step": 25172 + }, + { + "epoch": 0.3701084861501184, + "grad_norm": 1.0130892992019653, + "learning_rate": 4.03173584994368e-06, + "loss": 0.6847, + "step": 25203 + }, + { + "epoch": 0.37056372414046296, + "grad_norm": 1.0924913883209229, + "learning_rate": 3.986121904350948e-06, + "loss": 0.6789, + "step": 25234 + }, + { + "epoch": 0.3710189621308075, + "grad_norm": 1.0095348358154297, + "learning_rate": 3.940745097516407e-06, + "loss": 0.6772, + "step": 25265 + }, + { + "epoch": 0.37147420012115207, + "grad_norm": 1.0151805877685547, + "learning_rate": 3.89560594151116e-06, + "loss": 0.6835, + "step": 25296 + }, + { + "epoch": 0.3719294381114966, + "grad_norm": 1.031278133392334, + "learning_rate": 3.850704945724456e-06, + "loss": 0.6775, + "step": 25327 + }, + { + "epoch": 0.3723846761018411, + "grad_norm": 0.9886388778686523, + "learning_rate": 3.8060426168579077e-06, + "loss": 0.6833, + "step": 25358 + }, + { + "epoch": 0.3728399140921857, + "grad_norm": 1.0071016550064087, + "learning_rate": 3.7616194589198407e-06, + "loss": 0.6763, + "step": 25389 + }, + { + "epoch": 0.3732951520825302, + "grad_norm": 0.986391007900238, + "learning_rate": 3.7174359732195574e-06, + "loss": 0.6753, + "step": 25420 + }, + { + "epoch": 0.3737503900728748, + "grad_norm": 1.0014315843582153, + "learning_rate": 3.673492658361677e-06, + "loss": 0.6811, + "step": 25451 + }, + { + "epoch": 0.37420562806321933, + "grad_norm": 1.0151642560958862, + "learning_rate": 3.6297900102405467e-06, + "loss": 0.6699, + "step": 25482 + }, + { + "epoch": 0.3746608660535639, + "grad_norm": 1.1334686279296875, + "learning_rate": 3.586328522034607e-06, + "loss": 0.6692, + "step": 25513 + }, + { + "epoch": 0.37511610404390844, + "grad_norm": 1.0605874061584473, + "learning_rate": 3.543108684200838e-06, + "loss": 0.6792, + "step": 25544 + }, + { + "epoch": 0.37557134203425296, + "grad_norm": 0.9361720681190491, + "learning_rate": 3.5001309844692464e-06, + "loss": 0.6826, + "step": 25575 + }, + { + "epoch": 0.37602658002459755, + "grad_norm": 1.131210446357727, + "learning_rate": 3.4573959078373215e-06, + "loss": 0.6725, + "step": 25606 + }, + { + "epoch": 0.37648181801494207, + "grad_norm": 1.065705418586731, + "learning_rate": 3.4149039365646063e-06, + "loss": 0.671, + "step": 25637 + }, + { + "epoch": 0.37693705600528665, + "grad_norm": 1.0600756406784058, + "learning_rate": 3.3726555501672143e-06, + "loss": 0.682, + "step": 25668 + }, + { + "epoch": 0.3773922939956312, + "grad_norm": 0.9861871004104614, + "learning_rate": 3.33065122541244e-06, + "loss": 0.6719, + "step": 25699 + }, + { + "epoch": 0.37784753198597576, + "grad_norm": 0.980603039264679, + "learning_rate": 3.288891436313385e-06, + "loss": 0.6784, + "step": 25730 + }, + { + "epoch": 0.3783027699763203, + "grad_norm": 1.0262209177017212, + "learning_rate": 3.2473766541235963e-06, + "loss": 0.6745, + "step": 25761 + }, + { + "epoch": 0.3787580079666648, + "grad_norm": 1.1122559309005737, + "learning_rate": 3.2061073473317466e-06, + "loss": 0.6851, + "step": 25792 + }, + { + "epoch": 0.3792132459570094, + "grad_norm": 0.986608624458313, + "learning_rate": 3.1650839816563444e-06, + "loss": 0.6794, + "step": 25823 + }, + { + "epoch": 0.3796684839473539, + "grad_norm": 1.1190105676651, + "learning_rate": 3.1243070200405093e-06, + "loss": 0.6706, + "step": 25854 + }, + { + "epoch": 0.3801237219376985, + "grad_norm": 0.9959911108016968, + "learning_rate": 3.0837769226467e-06, + "loss": 0.6809, + "step": 25885 + }, + { + "epoch": 0.380578959928043, + "grad_norm": 1.090646505355835, + "learning_rate": 3.0434941468515666e-06, + "loss": 0.677, + "step": 25916 + }, + { + "epoch": 0.38103419791838755, + "grad_norm": 1.1067842245101929, + "learning_rate": 3.003459147240753e-06, + "loss": 0.6751, + "step": 25947 + }, + { + "epoch": 0.38148943590873213, + "grad_norm": 1.0478054285049438, + "learning_rate": 2.9636723756037875e-06, + "loss": 0.6733, + "step": 25978 + }, + { + "epoch": 0.38194467389907666, + "grad_norm": 1.128760576248169, + "learning_rate": 2.9241342809289833e-06, + "loss": 0.6697, + "step": 26009 + }, + { + "epoch": 0.38239991188942124, + "grad_norm": 0.9529085755348206, + "learning_rate": 2.8848453093983594e-06, + "loss": 0.6713, + "step": 26040 + }, + { + "epoch": 0.38285514987976577, + "grad_norm": 1.1507978439331055, + "learning_rate": 2.8458059043826257e-06, + "loss": 0.6737, + "step": 26071 + }, + { + "epoch": 0.38331038787011035, + "grad_norm": 1.010461688041687, + "learning_rate": 2.807016506436172e-06, + "loss": 0.6759, + "step": 26102 + }, + { + "epoch": 0.3837656258604549, + "grad_norm": 0.9800288081169128, + "learning_rate": 2.7684775532920566e-06, + "loss": 0.6766, + "step": 26133 + }, + { + "epoch": 0.3842208638507994, + "grad_norm": 1.0265660285949707, + "learning_rate": 2.7301894798571425e-06, + "loss": 0.6621, + "step": 26164 + }, + { + "epoch": 0.384676101841144, + "grad_norm": 0.9935845136642456, + "learning_rate": 2.6921527182071386e-06, + "loss": 0.6806, + "step": 26195 + }, + { + "epoch": 0.3851313398314885, + "grad_norm": 0.9483484625816345, + "learning_rate": 2.654367697581725e-06, + "loss": 0.6701, + "step": 26226 + }, + { + "epoch": 0.3855865778218331, + "grad_norm": 0.9978191256523132, + "learning_rate": 2.6168348443797175e-06, + "loss": 0.6777, + "step": 26257 + }, + { + "epoch": 0.3860418158121776, + "grad_norm": 1.0558881759643555, + "learning_rate": 2.5795545821542757e-06, + "loss": 0.6717, + "step": 26288 + }, + { + "epoch": 0.38649705380252214, + "grad_norm": 0.9377934336662292, + "learning_rate": 2.54252733160808e-06, + "loss": 0.6664, + "step": 26319 + }, + { + "epoch": 0.3869522917928667, + "grad_norm": 1.0415966510772705, + "learning_rate": 2.5057535105886294e-06, + "loss": 0.6739, + "step": 26350 + }, + { + "epoch": 0.38740752978321125, + "grad_norm": 1.0703872442245483, + "learning_rate": 2.4692335340834953e-06, + "loss": 0.6739, + "step": 26381 + }, + { + "epoch": 0.3878627677735558, + "grad_norm": 1.0336194038391113, + "learning_rate": 2.432967814215639e-06, + "loss": 0.6802, + "step": 26412 + }, + { + "epoch": 0.38831800576390035, + "grad_norm": 1.1104646921157837, + "learning_rate": 2.396956760238794e-06, + "loss": 0.6778, + "step": 26443 + }, + { + "epoch": 0.38877324375424493, + "grad_norm": 0.9827953577041626, + "learning_rate": 2.361200778532796e-06, + "loss": 0.6739, + "step": 26474 + }, + { + "epoch": 0.38922848174458946, + "grad_norm": 1.060691237449646, + "learning_rate": 2.325700272599049e-06, + "loss": 0.6784, + "step": 26505 + }, + { + "epoch": 0.389683719734934, + "grad_norm": 1.0089689493179321, + "learning_rate": 2.2904556430559415e-06, + "loss": 0.6738, + "step": 26536 + }, + { + "epoch": 0.39013895772527857, + "grad_norm": 0.9602312445640564, + "learning_rate": 2.2554672876343106e-06, + "loss": 0.668, + "step": 26567 + }, + { + "epoch": 0.3905941957156231, + "grad_norm": 1.0028860569000244, + "learning_rate": 2.220735601173002e-06, + "loss": 0.6746, + "step": 26598 + }, + { + "epoch": 0.3910494337059677, + "grad_norm": 0.9555149674415588, + "learning_rate": 2.186260975614382e-06, + "loss": 0.6686, + "step": 26629 + }, + { + "epoch": 0.3915046716963122, + "grad_norm": 0.9502089619636536, + "learning_rate": 2.1520437999999034e-06, + "loss": 0.6789, + "step": 26660 + }, + { + "epoch": 0.3919599096866567, + "grad_norm": 0.9777249693870544, + "learning_rate": 2.1180844604657526e-06, + "loss": 0.6732, + "step": 26691 + }, + { + "epoch": 0.3924151476770013, + "grad_norm": 1.0464751720428467, + "learning_rate": 2.084383340238455e-06, + "loss": 0.6809, + "step": 26722 + }, + { + "epoch": 0.39287038566734583, + "grad_norm": 0.981984555721283, + "learning_rate": 2.0509408196305704e-06, + "loss": 0.6839, + "step": 26753 + }, + { + "epoch": 0.3933256236576904, + "grad_norm": 1.3059405088424683, + "learning_rate": 2.017757276036403e-06, + "loss": 0.6841, + "step": 26784 + }, + { + "epoch": 0.39378086164803494, + "grad_norm": 0.9858521223068237, + "learning_rate": 1.984833083927726e-06, + "loss": 0.6747, + "step": 26815 + }, + { + "epoch": 0.3942360996383795, + "grad_norm": 0.9905960559844971, + "learning_rate": 1.952168614849581e-06, + "loss": 0.6706, + "step": 26846 + }, + { + "epoch": 0.39469133762872405, + "grad_norm": 1.011273980140686, + "learning_rate": 1.919764237416058e-06, + "loss": 0.6633, + "step": 26877 + }, + { + "epoch": 0.3951465756190686, + "grad_norm": 0.9586290717124939, + "learning_rate": 1.8876203173061463e-06, + "loss": 0.6758, + "step": 26908 + }, + { + "epoch": 0.39560181360941316, + "grad_norm": 1.0454294681549072, + "learning_rate": 1.8557372172596206e-06, + "loss": 0.6795, + "step": 26939 + }, + { + "epoch": 0.3960570515997577, + "grad_norm": 0.969967246055603, + "learning_rate": 1.8241152970729341e-06, + "loss": 0.6777, + "step": 26970 + }, + { + "epoch": 0.39651228959010226, + "grad_norm": 1.0945801734924316, + "learning_rate": 1.7927549135951572e-06, + "loss": 0.6695, + "step": 27001 + }, + { + "epoch": 0.3969675275804468, + "grad_norm": 1.036529541015625, + "learning_rate": 1.7616564207239477e-06, + "loss": 0.6702, + "step": 27032 + }, + { + "epoch": 0.39742276557079137, + "grad_norm": 1.02977454662323, + "learning_rate": 1.730820169401584e-06, + "loss": 0.6773, + "step": 27063 + }, + { + "epoch": 0.3978780035611359, + "grad_norm": 1.1005606651306152, + "learning_rate": 1.7002465076109558e-06, + "loss": 0.6787, + "step": 27094 + }, + { + "epoch": 0.3983332415514804, + "grad_norm": 1.0047553777694702, + "learning_rate": 1.6699357803716898e-06, + "loss": 0.6787, + "step": 27125 + }, + { + "epoch": 0.398788479541825, + "grad_norm": 1.0358285903930664, + "learning_rate": 1.6398883297362305e-06, + "loss": 0.6747, + "step": 27156 + }, + { + "epoch": 0.39924371753216953, + "grad_norm": 0.9317636489868164, + "learning_rate": 1.6101044947859606e-06, + "loss": 0.6675, + "step": 27187 + }, + { + "epoch": 0.3996989555225141, + "grad_norm": 1.0691051483154297, + "learning_rate": 1.5805846116274114e-06, + "loss": 0.6726, + "step": 27218 + }, + { + "epoch": 0.40015419351285864, + "grad_norm": 1.053666114807129, + "learning_rate": 1.5513290133884611e-06, + "loss": 0.6766, + "step": 27249 + }, + { + "epoch": 0.40060943150320316, + "grad_norm": 1.0425807237625122, + "learning_rate": 1.5223380302145512e-06, + "loss": 0.6673, + "step": 27280 + }, + { + "epoch": 0.40106466949354774, + "grad_norm": 0.9612600207328796, + "learning_rate": 1.4936119892649925e-06, + "loss": 0.6768, + "step": 27311 + }, + { + "epoch": 0.40151990748389227, + "grad_norm": 0.9574009776115417, + "learning_rate": 1.4651512147092482e-06, + "loss": 0.6678, + "step": 27342 + }, + { + "epoch": 0.40197514547423685, + "grad_norm": 0.9669239521026611, + "learning_rate": 1.4369560277232908e-06, + "loss": 0.6805, + "step": 27373 + }, + { + "epoch": 0.4024303834645814, + "grad_norm": 1.0453078746795654, + "learning_rate": 1.409026746485978e-06, + "loss": 0.6714, + "step": 27404 + }, + { + "epoch": 0.40288562145492596, + "grad_norm": 1.1115361452102661, + "learning_rate": 1.3813636861754464e-06, + "loss": 0.671, + "step": 27435 + }, + { + "epoch": 0.4033408594452705, + "grad_norm": 1.0532453060150146, + "learning_rate": 1.3539671589655773e-06, + "loss": 0.6814, + "step": 27466 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.026490470961932e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-27468/training_args.bin b/checkpoint-27468/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-27468/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-30517/config.json b/checkpoint-30517/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-30517/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-30517/generation_config.json b/checkpoint-30517/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-30517/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-30517/model-00001-of-00007.safetensors b/checkpoint-30517/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e32d3fb1ab3b688aa85ce479a51c3b384c12e8c5 --- /dev/null +++ b/checkpoint-30517/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a359b750f6ab7df9c886897f4c9a6e9809f7b3ff21fc7d4d310c85a52c98c5 +size 4886466168 diff --git a/checkpoint-30517/model-00002-of-00007.safetensors b/checkpoint-30517/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-30517/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-30517/model-00003-of-00007.safetensors b/checkpoint-30517/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-30517/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-30517/model-00004-of-00007.safetensors b/checkpoint-30517/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-30517/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-30517/model-00005-of-00007.safetensors b/checkpoint-30517/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-30517/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-30517/model-00006-of-00007.safetensors b/checkpoint-30517/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f9d24d60f542bdc40d1b63728e4f4807c43a53e --- /dev/null +++ b/checkpoint-30517/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d44e66e2e18efa285692398241c7f447826a9503eaa614a71d6417684a5ab2 +size 4999813120 diff --git a/checkpoint-30517/model-00007-of-00007.safetensors b/checkpoint-30517/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..41844e746d4850e8773a61a7aebc7da8fb5ce539 --- /dev/null +++ b/checkpoint-30517/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5a3b99186448cc9d7508459f20329d6ec2ec7647dfcdb4c5e216066eadf5bb6 +size 2571158184 diff --git a/checkpoint-30517/model.safetensors.index.json b/checkpoint-30517/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-30517/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-30517/optimizer.pt b/checkpoint-30517/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..424fbe9fa6fb4015d9f33e4d8472eeaf0ca9e961 --- /dev/null +++ b/checkpoint-30517/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c024b2d785aac3878a24757fc17dc04f388f8604e7814c1084ae56ab4591c72 +size 15385036334 diff --git a/checkpoint-30517/rng_state.pth b/checkpoint-30517/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-30517/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-30517/scheduler.pt b/checkpoint-30517/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d585b62f4a9bc8119a4f1d03cf2bb269e99411b --- /dev/null +++ b/checkpoint-30517/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d91c597306f26392e8da0d33a09fdcca77cfdeb5ad42248045521772fa3d64e +size 1064 diff --git a/checkpoint-30517/trainer_state.json b/checkpoint-30517/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f9cf3888a0e0481f9d57ff558697f8c03ffa7302 --- /dev/null +++ b/checkpoint-30517/trainer_state.json @@ -0,0 +1,6921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.44814508875305176, + "eval_steps": 500, + "global_step": 30517, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + }, + { + "epoch": 0.13475044514198653, + "grad_norm": 1.226491928100586, + "learning_rate": 4.1890559668788344e-05, + "loss": 0.7349, + "step": 9176 + }, + { + "epoch": 0.13520568313233108, + "grad_norm": 1.3030654191970825, + "learning_rate": 4.1828548623731405e-05, + "loss": 0.7371, + "step": 9207 + }, + { + "epoch": 0.13566092112267564, + "grad_norm": 1.17802894115448, + "learning_rate": 4.1766347670790506e-05, + "loss": 0.7431, + "step": 9238 + }, + { + "epoch": 0.13611615911302016, + "grad_norm": 1.2893658876419067, + "learning_rate": 4.170395751189495e-05, + "loss": 0.7258, + "step": 9269 + }, + { + "epoch": 0.13657139710336472, + "grad_norm": 1.193758487701416, + "learning_rate": 4.164137885110921e-05, + "loss": 0.7362, + "step": 9300 + }, + { + "epoch": 0.13702663509370927, + "grad_norm": 1.2750978469848633, + "learning_rate": 4.157861239462495e-05, + "loss": 0.7265, + "step": 9331 + }, + { + "epoch": 0.13748187308405382, + "grad_norm": 1.161464810371399, + "learning_rate": 4.1515658850753114e-05, + "loss": 0.7389, + "step": 9362 + }, + { + "epoch": 0.13793711107439838, + "grad_norm": 1.2749390602111816, + "learning_rate": 4.145251892991588e-05, + "loss": 0.7315, + "step": 9393 + }, + { + "epoch": 0.13839234906474293, + "grad_norm": 1.1729388236999512, + "learning_rate": 4.138919334463868e-05, + "loss": 0.7352, + "step": 9424 + }, + { + "epoch": 0.13884758705508746, + "grad_norm": 1.0987122058868408, + "learning_rate": 4.1325682809542124e-05, + "loss": 0.7347, + "step": 9455 + }, + { + "epoch": 0.139302825045432, + "grad_norm": 1.0855766534805298, + "learning_rate": 4.126198804133398e-05, + "loss": 0.7271, + "step": 9486 + }, + { + "epoch": 0.13975806303577656, + "grad_norm": 1.1782172918319702, + "learning_rate": 4.1198109758801055e-05, + "loss": 0.74, + "step": 9517 + }, + { + "epoch": 0.14021330102612112, + "grad_norm": 1.186962366104126, + "learning_rate": 4.113404868280107e-05, + "loss": 0.725, + "step": 9548 + }, + { + "epoch": 0.14066853901646567, + "grad_norm": 1.2993977069854736, + "learning_rate": 4.106980553625457e-05, + "loss": 0.7346, + "step": 9579 + }, + { + "epoch": 0.14112377700681022, + "grad_norm": 1.154897928237915, + "learning_rate": 4.100538104413674e-05, + "loss": 0.7348, + "step": 9610 + }, + { + "epoch": 0.14157901499715475, + "grad_norm": 1.203722357749939, + "learning_rate": 4.09407759334692e-05, + "loss": 0.7312, + "step": 9641 + }, + { + "epoch": 0.1420342529874993, + "grad_norm": 1.102858543395996, + "learning_rate": 4.087599093331186e-05, + "loss": 0.7227, + "step": 9672 + }, + { + "epoch": 0.14248949097784386, + "grad_norm": 1.2667406797409058, + "learning_rate": 4.081102677475462e-05, + "loss": 0.727, + "step": 9703 + }, + { + "epoch": 0.1429447289681884, + "grad_norm": 1.221291422843933, + "learning_rate": 4.0745884190909194e-05, + "loss": 0.7317, + "step": 9734 + }, + { + "epoch": 0.14339996695853297, + "grad_norm": 1.0426162481307983, + "learning_rate": 4.0680563916900796e-05, + "loss": 0.7299, + "step": 9765 + }, + { + "epoch": 0.14385520494887752, + "grad_norm": 1.2158912420272827, + "learning_rate": 4.0615066689859815e-05, + "loss": 0.7399, + "step": 9796 + }, + { + "epoch": 0.14431044293922204, + "grad_norm": 1.2689307928085327, + "learning_rate": 4.0549393248913584e-05, + "loss": 0.721, + "step": 9827 + }, + { + "epoch": 0.1447656809295666, + "grad_norm": 1.3168833255767822, + "learning_rate": 4.048354433517794e-05, + "loss": 0.7258, + "step": 9858 + }, + { + "epoch": 0.14522091891991115, + "grad_norm": 1.1966122388839722, + "learning_rate": 4.0417520691748916e-05, + "loss": 0.7375, + "step": 9889 + }, + { + "epoch": 0.1456761569102557, + "grad_norm": 1.1252835988998413, + "learning_rate": 4.035132306369438e-05, + "loss": 0.7254, + "step": 9920 + }, + { + "epoch": 0.14613139490060026, + "grad_norm": 1.186324119567871, + "learning_rate": 4.028495219804555e-05, + "loss": 0.7337, + "step": 9951 + }, + { + "epoch": 0.1465866328909448, + "grad_norm": 1.1606496572494507, + "learning_rate": 4.021840884378864e-05, + "loss": 0.7273, + "step": 9982 + }, + { + "epoch": 0.14704187088128934, + "grad_norm": 1.316298007965088, + "learning_rate": 4.015169375185633e-05, + "loss": 0.7276, + "step": 10013 + }, + { + "epoch": 0.1474971088716339, + "grad_norm": 1.224236249923706, + "learning_rate": 4.0084807675119396e-05, + "loss": 0.7328, + "step": 10044 + }, + { + "epoch": 0.14795234686197845, + "grad_norm": 1.227148175239563, + "learning_rate": 4.0017751368378106e-05, + "loss": 0.739, + "step": 10075 + }, + { + "epoch": 0.148407584852323, + "grad_norm": 1.1307867765426636, + "learning_rate": 3.995052558835377e-05, + "loss": 0.7324, + "step": 10106 + }, + { + "epoch": 0.14886282284266755, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.988313109368017e-05, + "loss": 0.7263, + "step": 10137 + }, + { + "epoch": 0.1493180608330121, + "grad_norm": 1.1724634170532227, + "learning_rate": 3.981556864489504e-05, + "loss": 0.7289, + "step": 10168 + }, + { + "epoch": 0.14977329882335666, + "grad_norm": 1.140684723854065, + "learning_rate": 3.974783900443142e-05, + "loss": 0.7309, + "step": 10199 + }, + { + "epoch": 0.15022853681370119, + "grad_norm": 1.167183756828308, + "learning_rate": 3.9679942936609095e-05, + "loss": 0.729, + "step": 10230 + }, + { + "epoch": 0.15068377480404574, + "grad_norm": 1.2216546535491943, + "learning_rate": 3.961188120762596e-05, + "loss": 0.7288, + "step": 10261 + }, + { + "epoch": 0.1511390127943903, + "grad_norm": 1.227397084236145, + "learning_rate": 3.954365458554938e-05, + "loss": 0.7313, + "step": 10292 + }, + { + "epoch": 0.15159425078473485, + "grad_norm": 1.075441837310791, + "learning_rate": 3.947526384030751e-05, + "loss": 0.7306, + "step": 10323 + }, + { + "epoch": 0.1520494887750794, + "grad_norm": 1.1227167844772339, + "learning_rate": 3.9406709743680624e-05, + "loss": 0.7287, + "step": 10354 + }, + { + "epoch": 0.15250472676542395, + "grad_norm": 1.0665740966796875, + "learning_rate": 3.9337993069292366e-05, + "loss": 0.7246, + "step": 10385 + }, + { + "epoch": 0.15295996475576848, + "grad_norm": 1.250580072402954, + "learning_rate": 3.926911459260109e-05, + "loss": 0.7183, + "step": 10416 + }, + { + "epoch": 0.15341520274611303, + "grad_norm": 1.1809351444244385, + "learning_rate": 3.920007509089102e-05, + "loss": 0.7307, + "step": 10447 + }, + { + "epoch": 0.1538704407364576, + "grad_norm": 1.3934037685394287, + "learning_rate": 3.913087534326357e-05, + "loss": 0.7151, + "step": 10478 + }, + { + "epoch": 0.15432567872680214, + "grad_norm": 1.1272441148757935, + "learning_rate": 3.9061516130628475e-05, + "loss": 0.7309, + "step": 10509 + }, + { + "epoch": 0.1547809167171467, + "grad_norm": 1.2410191297531128, + "learning_rate": 3.8991998235695025e-05, + "loss": 0.7375, + "step": 10540 + }, + { + "epoch": 0.15523615470749125, + "grad_norm": 1.1426646709442139, + "learning_rate": 3.8922322442963224e-05, + "loss": 0.7342, + "step": 10571 + }, + { + "epoch": 0.15569139269783577, + "grad_norm": 1.2037526369094849, + "learning_rate": 3.885248953871491e-05, + "loss": 0.7329, + "step": 10602 + }, + { + "epoch": 0.15614663068818033, + "grad_norm": 1.2198660373687744, + "learning_rate": 3.8782500311004915e-05, + "loss": 0.7367, + "step": 10633 + }, + { + "epoch": 0.15660186867852488, + "grad_norm": 1.3129019737243652, + "learning_rate": 3.871235554965218e-05, + "loss": 0.7236, + "step": 10664 + }, + { + "epoch": 0.15705710666886943, + "grad_norm": 1.2182697057724, + "learning_rate": 3.864205604623078e-05, + "loss": 0.721, + "step": 10695 + }, + { + "epoch": 0.157512344659214, + "grad_norm": 1.1494821310043335, + "learning_rate": 3.857160259406107e-05, + "loss": 0.7223, + "step": 10726 + }, + { + "epoch": 0.15796758264955854, + "grad_norm": 1.0680466890335083, + "learning_rate": 3.8500995988200674e-05, + "loss": 0.7224, + "step": 10757 + }, + { + "epoch": 0.15842282063990307, + "grad_norm": 1.155052661895752, + "learning_rate": 3.843023702543556e-05, + "loss": 0.7268, + "step": 10788 + }, + { + "epoch": 0.15887805863024762, + "grad_norm": 1.1415716409683228, + "learning_rate": 3.8359326504270984e-05, + "loss": 0.7167, + "step": 10819 + }, + { + "epoch": 0.15933329662059217, + "grad_norm": 1.1486759185791016, + "learning_rate": 3.828826522492255e-05, + "loss": 0.7236, + "step": 10850 + }, + { + "epoch": 0.15978853461093673, + "grad_norm": 1.1916829347610474, + "learning_rate": 3.821705398930713e-05, + "loss": 0.7269, + "step": 10881 + }, + { + "epoch": 0.16024377260128128, + "grad_norm": 1.180760383605957, + "learning_rate": 3.814569360103385e-05, + "loss": 0.724, + "step": 10912 + }, + { + "epoch": 0.16069901059162583, + "grad_norm": 1.197651982307434, + "learning_rate": 3.807418486539499e-05, + "loss": 0.7265, + "step": 10943 + }, + { + "epoch": 0.16115424858197036, + "grad_norm": 1.1579455137252808, + "learning_rate": 3.80025285893569e-05, + "loss": 0.7247, + "step": 10974 + }, + { + "epoch": 0.16160948657231491, + "grad_norm": 1.1810616254806519, + "learning_rate": 3.793072558155093e-05, + "loss": 0.7256, + "step": 11005 + }, + { + "epoch": 0.16206472456265947, + "grad_norm": 1.1510332822799683, + "learning_rate": 3.785877665226426e-05, + "loss": 0.722, + "step": 11036 + }, + { + "epoch": 0.16251996255300402, + "grad_norm": 1.0499261617660522, + "learning_rate": 3.778668261343079e-05, + "loss": 0.7255, + "step": 11067 + }, + { + "epoch": 0.16297520054334858, + "grad_norm": 1.1509817838668823, + "learning_rate": 3.771444427862192e-05, + "loss": 0.7147, + "step": 11098 + }, + { + "epoch": 0.16343043853369313, + "grad_norm": 1.1638833284378052, + "learning_rate": 3.7642062463037465e-05, + "loss": 0.7187, + "step": 11129 + }, + { + "epoch": 0.16388567652403765, + "grad_norm": 1.024095058441162, + "learning_rate": 3.7569537983496373e-05, + "loss": 0.7244, + "step": 11160 + }, + { + "epoch": 0.1643409145143822, + "grad_norm": 1.1326078176498413, + "learning_rate": 3.749687165842753e-05, + "loss": 0.7221, + "step": 11191 + }, + { + "epoch": 0.16479615250472676, + "grad_norm": 1.1863468885421753, + "learning_rate": 3.7424064307860536e-05, + "loss": 0.725, + "step": 11222 + }, + { + "epoch": 0.16525139049507132, + "grad_norm": 1.2948099374771118, + "learning_rate": 3.735111675341645e-05, + "loss": 0.7249, + "step": 11253 + }, + { + "epoch": 0.16570662848541587, + "grad_norm": 1.1384631395339966, + "learning_rate": 3.7278029818298524e-05, + "loss": 0.7203, + "step": 11284 + }, + { + "epoch": 0.16616186647576042, + "grad_norm": 1.1335748434066772, + "learning_rate": 3.720480432728287e-05, + "loss": 0.7183, + "step": 11315 + }, + { + "epoch": 0.16661710446610495, + "grad_norm": 1.2514820098876953, + "learning_rate": 3.71314411067092e-05, + "loss": 0.7216, + "step": 11346 + }, + { + "epoch": 0.1670723424564495, + "grad_norm": 1.128739833831787, + "learning_rate": 3.70579409844715e-05, + "loss": 0.7234, + "step": 11377 + }, + { + "epoch": 0.16752758044679406, + "grad_norm": 1.152342677116394, + "learning_rate": 3.698430479000865e-05, + "loss": 0.7304, + "step": 11408 + }, + { + "epoch": 0.1679828184371386, + "grad_norm": 1.222424864768982, + "learning_rate": 3.691053335429509e-05, + "loss": 0.7164, + "step": 11439 + }, + { + "epoch": 0.16843805642748316, + "grad_norm": 1.2406268119812012, + "learning_rate": 3.683662750983147e-05, + "loss": 0.7157, + "step": 11470 + }, + { + "epoch": 0.16889329441782772, + "grad_norm": 1.1266759634017944, + "learning_rate": 3.676258809063518e-05, + "loss": 0.7228, + "step": 11501 + }, + { + "epoch": 0.16934853240817227, + "grad_norm": 1.1239662170410156, + "learning_rate": 3.6688415932231004e-05, + "loss": 0.7266, + "step": 11532 + }, + { + "epoch": 0.1698037703985168, + "grad_norm": 1.0813047885894775, + "learning_rate": 3.661411187164166e-05, + "loss": 0.7278, + "step": 11563 + }, + { + "epoch": 0.17025900838886135, + "grad_norm": 56.909889221191406, + "learning_rate": 3.65396767473784e-05, + "loss": 0.7163, + "step": 11594 + }, + { + "epoch": 0.1707142463792059, + "grad_norm": 1.0749647617340088, + "learning_rate": 3.6465111399431465e-05, + "loss": 0.7182, + "step": 11625 + }, + { + "epoch": 0.17116948436955046, + "grad_norm": 1.0705510377883911, + "learning_rate": 3.6390416669260674e-05, + "loss": 0.726, + "step": 11656 + }, + { + "epoch": 0.171624722359895, + "grad_norm": 1.1083126068115234, + "learning_rate": 3.63155933997859e-05, + "loss": 0.7262, + "step": 11687 + }, + { + "epoch": 0.17207996035023956, + "grad_norm": 1.2302770614624023, + "learning_rate": 3.624064243537758e-05, + "loss": 0.7229, + "step": 11718 + }, + { + "epoch": 0.1725351983405841, + "grad_norm": 1.240893006324768, + "learning_rate": 3.616556462184716e-05, + "loss": 0.7228, + "step": 11749 + }, + { + "epoch": 0.17299043633092864, + "grad_norm": 1.1897794008255005, + "learning_rate": 3.609036080643755e-05, + "loss": 0.7178, + "step": 11780 + }, + { + "epoch": 0.1734456743212732, + "grad_norm": 1.244950294494629, + "learning_rate": 3.60150318378136e-05, + "loss": 0.7244, + "step": 11811 + }, + { + "epoch": 0.17390091231161775, + "grad_norm": 1.1689528226852417, + "learning_rate": 3.5939578566052465e-05, + "loss": 0.7181, + "step": 11842 + }, + { + "epoch": 0.1743561503019623, + "grad_norm": 1.1300387382507324, + "learning_rate": 3.586400184263408e-05, + "loss": 0.7176, + "step": 11873 + }, + { + "epoch": 0.17481138829230686, + "grad_norm": 1.0884469747543335, + "learning_rate": 3.578830252043148e-05, + "loss": 0.7182, + "step": 11904 + }, + { + "epoch": 0.17526662628265138, + "grad_norm": 1.119352102279663, + "learning_rate": 3.571248145370125e-05, + "loss": 0.7223, + "step": 11935 + }, + { + "epoch": 0.17572186427299594, + "grad_norm": 1.0979810953140259, + "learning_rate": 3.5636539498073794e-05, + "loss": 0.7233, + "step": 11966 + }, + { + "epoch": 0.1761771022633405, + "grad_norm": 1.138571858406067, + "learning_rate": 3.556047751054378e-05, + "loss": 0.7279, + "step": 11997 + }, + { + "epoch": 0.17663234025368504, + "grad_norm": 1.2011150121688843, + "learning_rate": 3.548429634946039e-05, + "loss": 0.717, + "step": 12028 + }, + { + "epoch": 0.1770875782440296, + "grad_norm": 1.119472861289978, + "learning_rate": 3.540799687451768e-05, + "loss": 0.7227, + "step": 12059 + }, + { + "epoch": 0.17754281623437415, + "grad_norm": 1.089117169380188, + "learning_rate": 3.533157994674485e-05, + "loss": 0.7132, + "step": 12090 + }, + { + "epoch": 0.17799805422471868, + "grad_norm": 1.0937649011611938, + "learning_rate": 3.5255046428496546e-05, + "loss": 0.7285, + "step": 12121 + }, + { + "epoch": 0.17845329221506323, + "grad_norm": 1.0359808206558228, + "learning_rate": 3.517839718344311e-05, + "loss": 0.7178, + "step": 12152 + }, + { + "epoch": 0.17890853020540778, + "grad_norm": 1.1309690475463867, + "learning_rate": 3.510163307656086e-05, + "loss": 0.713, + "step": 12183 + }, + { + "epoch": 0.17936376819575234, + "grad_norm": 1.126597285270691, + "learning_rate": 3.5024754974122324e-05, + "loss": 0.7139, + "step": 12214 + }, + { + "epoch": 0.1798190061860969, + "grad_norm": 1.29429292678833, + "learning_rate": 3.494776374368643e-05, + "loss": 0.7261, + "step": 12245 + }, + { + "epoch": 0.18027424417644144, + "grad_norm": 1.1156132221221924, + "learning_rate": 3.4870660254088724e-05, + "loss": 0.7124, + "step": 12276 + }, + { + "epoch": 0.18072948216678597, + "grad_norm": 1.1548200845718384, + "learning_rate": 3.479344537543164e-05, + "loss": 0.7195, + "step": 12307 + }, + { + "epoch": 0.18118472015713052, + "grad_norm": 1.1318944692611694, + "learning_rate": 3.4716119979074565e-05, + "loss": 0.7215, + "step": 12338 + }, + { + "epoch": 0.18163995814747508, + "grad_norm": 1.1952061653137207, + "learning_rate": 3.463868493762412e-05, + "loss": 0.7159, + "step": 12369 + }, + { + "epoch": 0.18209519613781963, + "grad_norm": 1.1703433990478516, + "learning_rate": 3.456114112492418e-05, + "loss": 0.7145, + "step": 12400 + }, + { + "epoch": 0.18255043412816419, + "grad_norm": 1.065319538116455, + "learning_rate": 3.4483489416046164e-05, + "loss": 0.7108, + "step": 12431 + }, + { + "epoch": 0.18300567211850874, + "grad_norm": 1.278533697128296, + "learning_rate": 3.440573068727905e-05, + "loss": 0.7116, + "step": 12462 + }, + { + "epoch": 0.18346091010885326, + "grad_norm": 1.1642391681671143, + "learning_rate": 3.4327865816119495e-05, + "loss": 0.716, + "step": 12493 + }, + { + "epoch": 0.18391614809919782, + "grad_norm": 1.2108798027038574, + "learning_rate": 3.4249895681262025e-05, + "loss": 0.7194, + "step": 12524 + }, + { + "epoch": 0.18437138608954237, + "grad_norm": 1.085163950920105, + "learning_rate": 3.417182116258899e-05, + "loss": 0.7166, + "step": 12555 + }, + { + "epoch": 0.18482662407988693, + "grad_norm": 1.124064564704895, + "learning_rate": 3.409364314116074e-05, + "loss": 0.7218, + "step": 12586 + }, + { + "epoch": 0.18528186207023148, + "grad_norm": 1.1817457675933838, + "learning_rate": 3.401536249920559e-05, + "loss": 0.7161, + "step": 12617 + }, + { + "epoch": 0.18573710006057603, + "grad_norm": 1.1430065631866455, + "learning_rate": 3.393698012010998e-05, + "loss": 0.7223, + "step": 12648 + }, + { + "epoch": 0.18619233805092056, + "grad_norm": 1.151633620262146, + "learning_rate": 3.385849688840839e-05, + "loss": 0.7181, + "step": 12679 + }, + { + "epoch": 0.1866475760412651, + "grad_norm": 1.1847879886627197, + "learning_rate": 3.3779913689773414e-05, + "loss": 0.7144, + "step": 12710 + }, + { + "epoch": 0.18710281403160967, + "grad_norm": 1.0997257232666016, + "learning_rate": 3.370123141100578e-05, + "loss": 0.7179, + "step": 12741 + }, + { + "epoch": 0.18755805202195422, + "grad_norm": 1.3207893371582031, + "learning_rate": 3.3622450940024305e-05, + "loss": 0.7147, + "step": 12772 + }, + { + "epoch": 0.18801329001229877, + "grad_norm": 1.1414912939071655, + "learning_rate": 3.35435731658559e-05, + "loss": 0.7258, + "step": 12803 + }, + { + "epoch": 0.18846852800264333, + "grad_norm": 1.1708139181137085, + "learning_rate": 3.346459897862552e-05, + "loss": 0.7222, + "step": 12834 + }, + { + "epoch": 0.18892376599298788, + "grad_norm": 1.0472559928894043, + "learning_rate": 3.338552926954613e-05, + "loss": 0.7168, + "step": 12865 + }, + { + "epoch": 0.1893790039833324, + "grad_norm": 1.1117762327194214, + "learning_rate": 3.330636493090868e-05, + "loss": 0.7128, + "step": 12896 + }, + { + "epoch": 0.18983424197367696, + "grad_norm": 1.1351251602172852, + "learning_rate": 3.322710685607193e-05, + "loss": 0.72, + "step": 12927 + }, + { + "epoch": 0.1902894799640215, + "grad_norm": 1.1530694961547852, + "learning_rate": 3.314775593945251e-05, + "loss": 0.7133, + "step": 12958 + }, + { + "epoch": 0.19074471795436607, + "grad_norm": 1.1590766906738281, + "learning_rate": 3.3068313076514714e-05, + "loss": 0.7236, + "step": 12989 + }, + { + "epoch": 0.19119995594471062, + "grad_norm": 1.0657463073730469, + "learning_rate": 3.298877916376047e-05, + "loss": 0.7159, + "step": 13020 + }, + { + "epoch": 0.19165519393505517, + "grad_norm": 1.090234637260437, + "learning_rate": 3.290915509871915e-05, + "loss": 0.7249, + "step": 13051 + }, + { + "epoch": 0.1921104319253997, + "grad_norm": 1.1001083850860596, + "learning_rate": 3.282944177993753e-05, + "loss": 0.721, + "step": 13082 + }, + { + "epoch": 0.19256566991574425, + "grad_norm": 1.0616897344589233, + "learning_rate": 3.274964010696957e-05, + "loss": 0.7145, + "step": 13113 + }, + { + "epoch": 0.1930209079060888, + "grad_norm": 1.05698823928833, + "learning_rate": 3.266975098036629e-05, + "loss": 0.7106, + "step": 13144 + }, + { + "epoch": 0.19347614589643336, + "grad_norm": 1.0747356414794922, + "learning_rate": 3.258977530166562e-05, + "loss": 0.7199, + "step": 13175 + }, + { + "epoch": 0.1939313838867779, + "grad_norm": 1.112629771232605, + "learning_rate": 3.250971397338227e-05, + "loss": 0.7088, + "step": 13206 + }, + { + "epoch": 0.19438662187712247, + "grad_norm": 1.13507878780365, + "learning_rate": 3.2429567898997404e-05, + "loss": 0.7084, + "step": 13237 + }, + { + "epoch": 0.194841859867467, + "grad_norm": 1.137793779373169, + "learning_rate": 3.234933798294859e-05, + "loss": 0.7121, + "step": 13268 + }, + { + "epoch": 0.19529709785781155, + "grad_norm": 1.263027548789978, + "learning_rate": 3.2269025130619535e-05, + "loss": 0.7117, + "step": 13299 + }, + { + "epoch": 0.1957523358481561, + "grad_norm": 1.1799430847167969, + "learning_rate": 3.218863024832985e-05, + "loss": 0.7144, + "step": 13330 + }, + { + "epoch": 0.19620757383850065, + "grad_norm": 1.040297508239746, + "learning_rate": 3.2108154243324864e-05, + "loss": 0.7089, + "step": 13361 + }, + { + "epoch": 0.1966628118288452, + "grad_norm": 1.110816240310669, + "learning_rate": 3.2027598023765345e-05, + "loss": 0.7166, + "step": 13392 + }, + { + "epoch": 0.19711804981918976, + "grad_norm": 1.0399518013000488, + "learning_rate": 3.194696249871729e-05, + "loss": 0.7081, + "step": 13423 + }, + { + "epoch": 0.1975732878095343, + "grad_norm": 1.1946238279342651, + "learning_rate": 3.186624857814164e-05, + "loss": 0.7112, + "step": 13454 + }, + { + "epoch": 0.19802852579987884, + "grad_norm": 1.1289162635803223, + "learning_rate": 3.178545717288401e-05, + "loss": 0.7079, + "step": 13485 + }, + { + "epoch": 0.1984837637902234, + "grad_norm": 1.17491614818573, + "learning_rate": 3.170458919466444e-05, + "loss": 0.7117, + "step": 13516 + }, + { + "epoch": 0.19893900178056795, + "grad_norm": 1.1498847007751465, + "learning_rate": 3.1623645556067063e-05, + "loss": 0.7131, + "step": 13547 + }, + { + "epoch": 0.1993942397709125, + "grad_norm": 1.2516326904296875, + "learning_rate": 3.154262717052985e-05, + "loss": 0.7145, + "step": 13578 + }, + { + "epoch": 0.19984947776125705, + "grad_norm": 1.2112692594528198, + "learning_rate": 3.146153495233426e-05, + "loss": 0.7098, + "step": 13609 + }, + { + "epoch": 0.20030471575160158, + "grad_norm": 0.989539384841919, + "learning_rate": 3.1380369816594944e-05, + "loss": 0.7182, + "step": 13640 + }, + { + "epoch": 0.20075995374194613, + "grad_norm": 1.0849697589874268, + "learning_rate": 3.129913267924946e-05, + "loss": 0.7153, + "step": 13671 + }, + { + "epoch": 0.2012151917322907, + "grad_norm": 1.1133675575256348, + "learning_rate": 3.121782445704782e-05, + "loss": 0.7104, + "step": 13702 + }, + { + "epoch": 0.20167042972263524, + "grad_norm": 1.1086009740829468, + "learning_rate": 3.11364460675423e-05, + "loss": 0.7125, + "step": 13733 + }, + { + "epoch": 0.2021256677129798, + "grad_norm": 1.0160905122756958, + "learning_rate": 3.1054998429076934e-05, + "loss": 0.7055, + "step": 13764 + }, + { + "epoch": 0.20258090570332435, + "grad_norm": 1.0811957120895386, + "learning_rate": 3.097348246077728e-05, + "loss": 0.7105, + "step": 13795 + }, + { + "epoch": 0.20303614369366887, + "grad_norm": 1.0012198686599731, + "learning_rate": 3.0891899082539924e-05, + "loss": 0.7134, + "step": 13826 + }, + { + "epoch": 0.20349138168401343, + "grad_norm": 1.2298318147659302, + "learning_rate": 3.0810249215022233e-05, + "loss": 0.7054, + "step": 13857 + }, + { + "epoch": 0.20394661967435798, + "grad_norm": 1.1324481964111328, + "learning_rate": 3.0728533779631865e-05, + "loss": 0.7023, + "step": 13888 + }, + { + "epoch": 0.20440185766470254, + "grad_norm": 1.1454377174377441, + "learning_rate": 3.064675369851637e-05, + "loss": 0.7124, + "step": 13919 + }, + { + "epoch": 0.2048570956550471, + "grad_norm": 1.0461289882659912, + "learning_rate": 3.056490989455289e-05, + "loss": 0.7159, + "step": 13950 + }, + { + "epoch": 0.20531233364539164, + "grad_norm": 1.0020164251327515, + "learning_rate": 3.0483003291337596e-05, + "loss": 0.702, + "step": 13981 + }, + { + "epoch": 0.20576757163573617, + "grad_norm": 1.0952621698379517, + "learning_rate": 3.040103481317539e-05, + "loss": 0.7108, + "step": 14012 + }, + { + "epoch": 0.20622280962608072, + "grad_norm": 1.129813551902771, + "learning_rate": 3.03190053850694e-05, + "loss": 0.7043, + "step": 14043 + }, + { + "epoch": 0.20667804761642528, + "grad_norm": 1.012137770652771, + "learning_rate": 3.0236915932710573e-05, + "loss": 0.7002, + "step": 14074 + }, + { + "epoch": 0.20713328560676983, + "grad_norm": 1.1774550676345825, + "learning_rate": 3.0154767382467232e-05, + "loss": 0.6996, + "step": 14105 + }, + { + "epoch": 0.20758852359711438, + "grad_norm": 1.0878807306289673, + "learning_rate": 3.0072560661374582e-05, + "loss": 0.7075, + "step": 14136 + }, + { + "epoch": 0.20804376158745894, + "grad_norm": 1.1599992513656616, + "learning_rate": 2.999029669712431e-05, + "loss": 0.7165, + "step": 14167 + }, + { + "epoch": 0.2084989995778035, + "grad_norm": 1.184403419494629, + "learning_rate": 2.990797641805408e-05, + "loss": 0.709, + "step": 14198 + }, + { + "epoch": 0.20895423756814802, + "grad_norm": 1.0857961177825928, + "learning_rate": 2.982560075313704e-05, + "loss": 0.6986, + "step": 14229 + }, + { + "epoch": 0.20940947555849257, + "grad_norm": 1.0922378301620483, + "learning_rate": 2.9743170631971368e-05, + "loss": 0.7085, + "step": 14260 + }, + { + "epoch": 0.20986471354883712, + "grad_norm": 1.0843058824539185, + "learning_rate": 2.9660686984769792e-05, + "loss": 0.7114, + "step": 14291 + }, + { + "epoch": 0.21031995153918168, + "grad_norm": 1.2273341417312622, + "learning_rate": 2.9578150742349047e-05, + "loss": 0.7092, + "step": 14322 + }, + { + "epoch": 0.21077518952952623, + "grad_norm": 1.1041830778121948, + "learning_rate": 2.949556283611942e-05, + "loss": 0.7086, + "step": 14353 + }, + { + "epoch": 0.21123042751987078, + "grad_norm": 1.1019948720932007, + "learning_rate": 2.9412924198074206e-05, + "loss": 0.7095, + "step": 14384 + }, + { + "epoch": 0.2116856655102153, + "grad_norm": 1.1595929861068726, + "learning_rate": 2.9330235760779208e-05, + "loss": 0.7083, + "step": 14415 + }, + { + "epoch": 0.21214090350055986, + "grad_norm": 1.252021312713623, + "learning_rate": 2.9247498457362188e-05, + "loss": 0.7045, + "step": 14446 + }, + { + "epoch": 0.21259614149090442, + "grad_norm": 1.1469565629959106, + "learning_rate": 2.9164713221502373e-05, + "loss": 0.6995, + "step": 14477 + }, + { + "epoch": 0.21305137948124897, + "grad_norm": 1.1796189546585083, + "learning_rate": 2.9081880987419912e-05, + "loss": 0.7101, + "step": 14508 + }, + { + "epoch": 0.21350661747159352, + "grad_norm": 1.183271884918213, + "learning_rate": 2.8999002689865296e-05, + "loss": 0.7074, + "step": 14539 + }, + { + "epoch": 0.21396185546193808, + "grad_norm": 1.2489229440689087, + "learning_rate": 2.8916079264108852e-05, + "loss": 0.7126, + "step": 14570 + }, + { + "epoch": 0.2144170934522826, + "grad_norm": 1.059584379196167, + "learning_rate": 2.883311164593017e-05, + "loss": 0.7045, + "step": 14601 + }, + { + "epoch": 0.21487233144262716, + "grad_norm": 1.1565430164337158, + "learning_rate": 2.875010077160754e-05, + "loss": 0.7075, + "step": 14632 + }, + { + "epoch": 0.2153275694329717, + "grad_norm": 1.0368664264678955, + "learning_rate": 2.866704757790741e-05, + "loss": 0.7065, + "step": 14663 + }, + { + "epoch": 0.21578280742331626, + "grad_norm": 1.1395366191864014, + "learning_rate": 2.858395300207376e-05, + "loss": 0.7029, + "step": 14694 + }, + { + "epoch": 0.21623804541366082, + "grad_norm": 1.077816128730774, + "learning_rate": 2.8500817981817607e-05, + "loss": 0.7078, + "step": 14725 + }, + { + "epoch": 0.21669328340400537, + "grad_norm": 1.2020843029022217, + "learning_rate": 2.8417643455306336e-05, + "loss": 0.7011, + "step": 14756 + }, + { + "epoch": 0.2171485213943499, + "grad_norm": 1.0293958187103271, + "learning_rate": 2.8334430361153185e-05, + "loss": 0.7038, + "step": 14787 + }, + { + "epoch": 0.21760375938469445, + "grad_norm": 1.2163504362106323, + "learning_rate": 2.8251179638406612e-05, + "loss": 0.7029, + "step": 14818 + }, + { + "epoch": 0.218058997375039, + "grad_norm": 1.0222349166870117, + "learning_rate": 2.8167892226539704e-05, + "loss": 0.7084, + "step": 14849 + }, + { + "epoch": 0.21851423536538356, + "grad_norm": 1.0532034635543823, + "learning_rate": 2.8084569065439588e-05, + "loss": 0.7162, + "step": 14880 + }, + { + "epoch": 0.2189694733557281, + "grad_norm": 0.9981489181518555, + "learning_rate": 2.8001211095396807e-05, + "loss": 0.7088, + "step": 14911 + }, + { + "epoch": 0.21942471134607267, + "grad_norm": 1.0804964303970337, + "learning_rate": 2.791781925709473e-05, + "loss": 0.7035, + "step": 14942 + }, + { + "epoch": 0.2198799493364172, + "grad_norm": 1.1326287984848022, + "learning_rate": 2.7834394491598908e-05, + "loss": 0.708, + "step": 14973 + }, + { + "epoch": 0.22033518732676174, + "grad_norm": 1.1164413690567017, + "learning_rate": 2.7750937740346485e-05, + "loss": 0.7047, + "step": 15004 + }, + { + "epoch": 0.2207904253171063, + "grad_norm": 1.1455166339874268, + "learning_rate": 2.7667449945135564e-05, + "loss": 0.708, + "step": 15035 + }, + { + "epoch": 0.22124566330745085, + "grad_norm": 1.1329716444015503, + "learning_rate": 2.7583932048114557e-05, + "loss": 0.6923, + "step": 15066 + }, + { + "epoch": 0.2217009012977954, + "grad_norm": 1.3479692935943604, + "learning_rate": 2.7500384991771587e-05, + "loss": 0.7119, + "step": 15097 + }, + { + "epoch": 0.22215613928813996, + "grad_norm": 1.1160681247711182, + "learning_rate": 2.7416809718923825e-05, + "loss": 0.7031, + "step": 15128 + }, + { + "epoch": 0.22261137727848448, + "grad_norm": 1.1188467741012573, + "learning_rate": 2.7333207172706864e-05, + "loss": 0.7029, + "step": 15159 + }, + { + "epoch": 0.22306661526882904, + "grad_norm": 0.9910608530044556, + "learning_rate": 2.7249578296564088e-05, + "loss": 0.6996, + "step": 15190 + }, + { + "epoch": 0.2235218532591736, + "grad_norm": 1.100176453590393, + "learning_rate": 2.7165924034235973e-05, + "loss": 0.7027, + "step": 15221 + }, + { + "epoch": 0.22397709124951815, + "grad_norm": 1.1848573684692383, + "learning_rate": 2.708224532974953e-05, + "loss": 0.7111, + "step": 15252 + }, + { + "epoch": 0.2244323292398627, + "grad_norm": 1.1293883323669434, + "learning_rate": 2.6998543127407538e-05, + "loss": 0.7045, + "step": 15283 + }, + { + "epoch": 0.22488756723020725, + "grad_norm": 1.1224740743637085, + "learning_rate": 2.6914818371777988e-05, + "loss": 0.7113, + "step": 15314 + }, + { + "epoch": 0.2253428052205518, + "grad_norm": 1.2419655323028564, + "learning_rate": 2.6831072007683373e-05, + "loss": 0.7062, + "step": 15345 + }, + { + "epoch": 0.22579804321089633, + "grad_norm": 1.1906564235687256, + "learning_rate": 2.6747304980190018e-05, + "loss": 0.7112, + "step": 15376 + }, + { + "epoch": 0.22625328120124089, + "grad_norm": 1.0610102415084839, + "learning_rate": 2.6663518234597453e-05, + "loss": 0.7052, + "step": 15407 + }, + { + "epoch": 0.22670851919158544, + "grad_norm": 1.3254245519638062, + "learning_rate": 2.6579712716427696e-05, + "loss": 0.6975, + "step": 15438 + }, + { + "epoch": 0.22716375718193, + "grad_norm": 1.1128469705581665, + "learning_rate": 2.6495889371414652e-05, + "loss": 0.6993, + "step": 15469 + }, + { + "epoch": 0.22761899517227455, + "grad_norm": 1.0977287292480469, + "learning_rate": 2.6412049145493367e-05, + "loss": 0.7001, + "step": 15500 + }, + { + "epoch": 0.2280742331626191, + "grad_norm": 0.9699016213417053, + "learning_rate": 2.632819298478939e-05, + "loss": 0.7082, + "step": 15531 + }, + { + "epoch": 0.22852947115296363, + "grad_norm": 1.1493170261383057, + "learning_rate": 2.6244321835608105e-05, + "loss": 0.7019, + "step": 15562 + }, + { + "epoch": 0.22898470914330818, + "grad_norm": 1.1549670696258545, + "learning_rate": 2.6160436644424024e-05, + "loss": 0.7087, + "step": 15593 + }, + { + "epoch": 0.22943994713365273, + "grad_norm": 1.2285927534103394, + "learning_rate": 2.6076538357870133e-05, + "loss": 0.695, + "step": 15624 + }, + { + "epoch": 0.2298951851239973, + "grad_norm": 1.0625406503677368, + "learning_rate": 2.5992627922727196e-05, + "loss": 0.7072, + "step": 15655 + }, + { + "epoch": 0.23035042311434184, + "grad_norm": 1.2031610012054443, + "learning_rate": 2.5908706285913066e-05, + "loss": 0.6952, + "step": 15686 + }, + { + "epoch": 0.2308056611046864, + "grad_norm": 1.0590460300445557, + "learning_rate": 2.5824774394472008e-05, + "loss": 0.6981, + "step": 15717 + }, + { + "epoch": 0.23126089909503092, + "grad_norm": 1.0085610151290894, + "learning_rate": 2.5740833195563996e-05, + "loss": 0.7006, + "step": 15748 + }, + { + "epoch": 0.23171613708537547, + "grad_norm": 1.1644418239593506, + "learning_rate": 2.5656883636454067e-05, + "loss": 0.7023, + "step": 15779 + }, + { + "epoch": 0.23217137507572003, + "grad_norm": 1.0243310928344727, + "learning_rate": 2.557292666450159e-05, + "loss": 0.7106, + "step": 15810 + }, + { + "epoch": 0.23262661306606458, + "grad_norm": 1.0970982313156128, + "learning_rate": 2.5488963227149566e-05, + "loss": 0.7018, + "step": 15841 + }, + { + "epoch": 0.23308185105640913, + "grad_norm": 1.0774227380752563, + "learning_rate": 2.5404994271913983e-05, + "loss": 0.7058, + "step": 15872 + }, + { + "epoch": 0.2335370890467537, + "grad_norm": 1.2018071413040161, + "learning_rate": 2.5321020746373085e-05, + "loss": 0.7072, + "step": 15903 + }, + { + "epoch": 0.2339923270370982, + "grad_norm": 1.1195406913757324, + "learning_rate": 2.52370435981567e-05, + "loss": 0.6971, + "step": 15934 + }, + { + "epoch": 0.23444756502744277, + "grad_norm": 1.0707147121429443, + "learning_rate": 2.5153063774935533e-05, + "loss": 0.7005, + "step": 15965 + }, + { + "epoch": 0.23490280301778732, + "grad_norm": 1.0488064289093018, + "learning_rate": 2.506908222441045e-05, + "loss": 0.6994, + "step": 15996 + }, + { + "epoch": 0.23535804100813187, + "grad_norm": 1.0699859857559204, + "learning_rate": 2.498509989430187e-05, + "loss": 0.6931, + "step": 16027 + }, + { + "epoch": 0.23581327899847643, + "grad_norm": 1.0461689233779907, + "learning_rate": 2.4901117732338958e-05, + "loss": 0.7022, + "step": 16058 + }, + { + "epoch": 0.23626851698882098, + "grad_norm": 1.096604824066162, + "learning_rate": 2.481713668624899e-05, + "loss": 0.7043, + "step": 16089 + }, + { + "epoch": 0.2367237549791655, + "grad_norm": 1.0687739849090576, + "learning_rate": 2.4733157703746663e-05, + "loss": 0.7043, + "step": 16120 + }, + { + "epoch": 0.23717899296951006, + "grad_norm": 1.1307755708694458, + "learning_rate": 2.4649181732523392e-05, + "loss": 0.7059, + "step": 16151 + }, + { + "epoch": 0.23763423095985461, + "grad_norm": 1.0404301881790161, + "learning_rate": 2.4565209720236582e-05, + "loss": 0.6981, + "step": 16182 + }, + { + "epoch": 0.23808946895019917, + "grad_norm": 1.0836886167526245, + "learning_rate": 2.4481242614498975e-05, + "loss": 0.7145, + "step": 16213 + }, + { + "epoch": 0.23854470694054372, + "grad_norm": 1.0622589588165283, + "learning_rate": 2.439728136286796e-05, + "loss": 0.7069, + "step": 16244 + }, + { + "epoch": 0.23899994493088828, + "grad_norm": 1.1610299348831177, + "learning_rate": 2.4313326912834852e-05, + "loss": 0.7022, + "step": 16275 + }, + { + "epoch": 0.2394551829212328, + "grad_norm": 1.004273772239685, + "learning_rate": 2.4229380211814206e-05, + "loss": 0.6905, + "step": 16306 + }, + { + "epoch": 0.23991042091157735, + "grad_norm": 1.0684071779251099, + "learning_rate": 2.4145442207133124e-05, + "loss": 0.6977, + "step": 16337 + }, + { + "epoch": 0.2403656589019219, + "grad_norm": 0.9177312850952148, + "learning_rate": 2.406151384602059e-05, + "loss": 0.6979, + "step": 16368 + }, + { + "epoch": 0.24082089689226646, + "grad_norm": 1.0734107494354248, + "learning_rate": 2.3977596075596747e-05, + "loss": 0.6987, + "step": 16399 + }, + { + "epoch": 0.24127613488261102, + "grad_norm": 1.1414164304733276, + "learning_rate": 2.3893689842862223e-05, + "loss": 0.6927, + "step": 16430 + }, + { + "epoch": 0.24173137287295557, + "grad_norm": 1.1547383069992065, + "learning_rate": 2.3809796094687475e-05, + "loss": 0.7053, + "step": 16461 + }, + { + "epoch": 0.2421866108633001, + "grad_norm": 1.0909677743911743, + "learning_rate": 2.372591577780202e-05, + "loss": 0.6987, + "step": 16492 + }, + { + "epoch": 0.24264184885364465, + "grad_norm": 1.0706005096435547, + "learning_rate": 2.3642049838783838e-05, + "loss": 0.7014, + "step": 16523 + }, + { + "epoch": 0.2430970868439892, + "grad_norm": 1.0389344692230225, + "learning_rate": 2.3558199224048666e-05, + "loss": 0.695, + "step": 16554 + }, + { + "epoch": 0.24355232483433376, + "grad_norm": 1.0836538076400757, + "learning_rate": 2.347436487983929e-05, + "loss": 0.7004, + "step": 16585 + }, + { + "epoch": 0.2440075628246783, + "grad_norm": 1.0748459100723267, + "learning_rate": 2.3390547752214888e-05, + "loss": 0.7018, + "step": 16616 + }, + { + "epoch": 0.24446280081502286, + "grad_norm": 1.097935438156128, + "learning_rate": 2.330674878704035e-05, + "loss": 0.706, + "step": 16647 + }, + { + "epoch": 0.24491803880536742, + "grad_norm": 1.1082520484924316, + "learning_rate": 2.322296892997561e-05, + "loss": 0.7012, + "step": 16678 + }, + { + "epoch": 0.24537327679571194, + "grad_norm": 1.0682934522628784, + "learning_rate": 2.313920912646497e-05, + "loss": 0.701, + "step": 16709 + }, + { + "epoch": 0.2458285147860565, + "grad_norm": 1.1116893291473389, + "learning_rate": 2.305547032172643e-05, + "loss": 0.7038, + "step": 16740 + }, + { + "epoch": 0.24628375277640105, + "grad_norm": 1.0376949310302734, + "learning_rate": 2.2971753460741014e-05, + "loss": 0.6998, + "step": 16771 + }, + { + "epoch": 0.2467389907667456, + "grad_norm": 1.0389093160629272, + "learning_rate": 2.288805948824212e-05, + "loss": 0.7043, + "step": 16802 + }, + { + "epoch": 0.24719422875709016, + "grad_norm": 1.0645474195480347, + "learning_rate": 2.2804389348704858e-05, + "loss": 0.6947, + "step": 16833 + }, + { + "epoch": 0.2476494667474347, + "grad_norm": 1.0893995761871338, + "learning_rate": 2.2720743986335374e-05, + "loss": 0.6981, + "step": 16864 + }, + { + "epoch": 0.24810470473777924, + "grad_norm": 1.022275447845459, + "learning_rate": 2.2637124345060233e-05, + "loss": 0.7081, + "step": 16895 + }, + { + "epoch": 0.2485599427281238, + "grad_norm": 1.1055867671966553, + "learning_rate": 2.2553531368515695e-05, + "loss": 0.6942, + "step": 16926 + }, + { + "epoch": 0.24901518071846834, + "grad_norm": 1.0815192461013794, + "learning_rate": 2.2469966000037144e-05, + "loss": 0.6951, + "step": 16957 + }, + { + "epoch": 0.2494704187088129, + "grad_norm": 1.0612388849258423, + "learning_rate": 2.2386429182648417e-05, + "loss": 0.6991, + "step": 16988 + }, + { + "epoch": 0.24992565669915745, + "grad_norm": 1.0434961318969727, + "learning_rate": 2.230292185905114e-05, + "loss": 0.6904, + "step": 17019 + }, + { + "epoch": 0.250380894689502, + "grad_norm": 1.0427175760269165, + "learning_rate": 2.2219444971614116e-05, + "loss": 0.6979, + "step": 17050 + }, + { + "epoch": 0.25083613267984656, + "grad_norm": 1.0715687274932861, + "learning_rate": 2.2135999462362655e-05, + "loss": 0.7034, + "step": 17081 + }, + { + "epoch": 0.2512913706701911, + "grad_norm": 1.0116679668426514, + "learning_rate": 2.2052586272968003e-05, + "loss": 0.6996, + "step": 17112 + }, + { + "epoch": 0.25174660866053566, + "grad_norm": 1.2103781700134277, + "learning_rate": 2.196920634473666e-05, + "loss": 0.7026, + "step": 17143 + }, + { + "epoch": 0.2522018466508802, + "grad_norm": 1.0434819459915161, + "learning_rate": 2.1885860618599787e-05, + "loss": 0.6979, + "step": 17174 + }, + { + "epoch": 0.2526570846412247, + "grad_norm": 1.2911967039108276, + "learning_rate": 2.1802550035102577e-05, + "loss": 0.6866, + "step": 17205 + }, + { + "epoch": 0.2531123226315693, + "grad_norm": 1.1720303297042847, + "learning_rate": 2.171927553439363e-05, + "loss": 0.6868, + "step": 17236 + }, + { + "epoch": 0.2535675606219138, + "grad_norm": 1.0302678346633911, + "learning_rate": 2.1636038056214376e-05, + "loss": 0.691, + "step": 17267 + }, + { + "epoch": 0.2540227986122584, + "grad_norm": 1.0190601348876953, + "learning_rate": 2.155283853988844e-05, + "loss": 0.6964, + "step": 17298 + }, + { + "epoch": 0.25447803660260293, + "grad_norm": 1.109703540802002, + "learning_rate": 2.146967792431106e-05, + "loss": 0.693, + "step": 17329 + }, + { + "epoch": 0.25493327459294746, + "grad_norm": 1.160040020942688, + "learning_rate": 2.138655714793849e-05, + "loss": 0.6943, + "step": 17360 + }, + { + "epoch": 0.25538851258329204, + "grad_norm": 1.083268404006958, + "learning_rate": 2.1303477148777367e-05, + "loss": 0.7024, + "step": 17391 + }, + { + "epoch": 0.25584375057363656, + "grad_norm": 1.0631040334701538, + "learning_rate": 2.122043886437421e-05, + "loss": 0.6896, + "step": 17422 + }, + { + "epoch": 0.25629898856398114, + "grad_norm": 1.2141170501708984, + "learning_rate": 2.1137443231804765e-05, + "loss": 0.7005, + "step": 17453 + }, + { + "epoch": 0.25675422655432567, + "grad_norm": 1.082511067390442, + "learning_rate": 2.105449118766347e-05, + "loss": 0.6906, + "step": 17484 + }, + { + "epoch": 0.25720946454467025, + "grad_norm": 0.9919353127479553, + "learning_rate": 2.097158366805287e-05, + "loss": 0.7017, + "step": 17515 + }, + { + "epoch": 0.2576647025350148, + "grad_norm": 1.0450084209442139, + "learning_rate": 2.0888721608573047e-05, + "loss": 0.696, + "step": 17546 + }, + { + "epoch": 0.2581199405253593, + "grad_norm": 1.0460536479949951, + "learning_rate": 2.0805905944311087e-05, + "loss": 0.6947, + "step": 17577 + }, + { + "epoch": 0.2585751785157039, + "grad_norm": 1.0822510719299316, + "learning_rate": 2.0723137609830497e-05, + "loss": 0.7039, + "step": 17608 + }, + { + "epoch": 0.2590304165060484, + "grad_norm": 1.0411216020584106, + "learning_rate": 2.0640417539160686e-05, + "loss": 0.6962, + "step": 17639 + }, + { + "epoch": 0.259485654496393, + "grad_norm": 1.0115315914154053, + "learning_rate": 2.0557746665786427e-05, + "loss": 0.6954, + "step": 17670 + }, + { + "epoch": 0.2599408924867375, + "grad_norm": 1.0552514791488647, + "learning_rate": 2.0475125922637256e-05, + "loss": 0.6881, + "step": 17701 + }, + { + "epoch": 0.26039613047708204, + "grad_norm": 0.9966985583305359, + "learning_rate": 2.0392556242077047e-05, + "loss": 0.7012, + "step": 17732 + }, + { + "epoch": 0.2608513684674266, + "grad_norm": 1.113692045211792, + "learning_rate": 2.031003855589343e-05, + "loss": 0.703, + "step": 17763 + }, + { + "epoch": 0.26130660645777115, + "grad_norm": 1.0169728994369507, + "learning_rate": 2.022757379528727e-05, + "loss": 0.7008, + "step": 17794 + }, + { + "epoch": 0.26176184444811573, + "grad_norm": 1.1313414573669434, + "learning_rate": 2.0145162890862184e-05, + "loss": 0.6956, + "step": 17825 + }, + { + "epoch": 0.26221708243846026, + "grad_norm": 0.9456464052200317, + "learning_rate": 2.0062806772614022e-05, + "loss": 0.694, + "step": 17856 + }, + { + "epoch": 0.26267232042880484, + "grad_norm": 1.0825542211532593, + "learning_rate": 1.9980506369920392e-05, + "loss": 0.6915, + "step": 17887 + }, + { + "epoch": 0.26312755841914937, + "grad_norm": 1.059581995010376, + "learning_rate": 1.989826261153015e-05, + "loss": 0.6922, + "step": 17918 + }, + { + "epoch": 0.2635827964094939, + "grad_norm": 1.0134432315826416, + "learning_rate": 1.9816076425552923e-05, + "loss": 0.6952, + "step": 17949 + }, + { + "epoch": 0.2640380343998385, + "grad_norm": 0.9800439476966858, + "learning_rate": 1.9733948739448676e-05, + "loss": 0.7036, + "step": 17980 + }, + { + "epoch": 0.264493272390183, + "grad_norm": 1.128818392753601, + "learning_rate": 1.9651880480017155e-05, + "loss": 0.6916, + "step": 18011 + }, + { + "epoch": 0.2649485103805276, + "grad_norm": 1.0002161264419556, + "learning_rate": 1.9569872573387516e-05, + "loss": 0.6886, + "step": 18042 + }, + { + "epoch": 0.2654037483708721, + "grad_norm": 1.1037601232528687, + "learning_rate": 1.9487925945007854e-05, + "loss": 0.6954, + "step": 18073 + }, + { + "epoch": 0.2658589863612167, + "grad_norm": 1.0204657316207886, + "learning_rate": 1.9406041519634726e-05, + "loss": 0.6976, + "step": 18104 + }, + { + "epoch": 0.2663142243515612, + "grad_norm": 1.0254517793655396, + "learning_rate": 1.932422022132275e-05, + "loss": 0.697, + "step": 18135 + }, + { + "epoch": 0.26676946234190574, + "grad_norm": 1.0792242288589478, + "learning_rate": 1.924246297341414e-05, + "loss": 0.6932, + "step": 18166 + }, + { + "epoch": 0.2672247003322503, + "grad_norm": 1.2440094947814941, + "learning_rate": 1.9160770698528338e-05, + "loss": 0.6925, + "step": 18197 + }, + { + "epoch": 0.26767993832259485, + "grad_norm": 1.0181853771209717, + "learning_rate": 1.907914431855156e-05, + "loss": 0.6854, + "step": 18228 + }, + { + "epoch": 0.2681351763129394, + "grad_norm": 0.982681930065155, + "learning_rate": 1.8997584754626412e-05, + "loss": 0.6892, + "step": 18259 + }, + { + "epoch": 0.26859041430328395, + "grad_norm": 1.1587820053100586, + "learning_rate": 1.8916092927141486e-05, + "loss": 0.6955, + "step": 18290 + }, + { + "epoch": 0.2690456522936285, + "grad_norm": 1.0297470092773438, + "learning_rate": 1.883466975572098e-05, + "loss": 0.6921, + "step": 18321 + }, + { + "epoch": 0.26950089028397306, + "grad_norm": 1.0646672248840332, + "learning_rate": 1.8753316159214312e-05, + "loss": 0.6966, + "step": 18352 + }, + { + "epoch": 0.2699561282743176, + "grad_norm": 1.0070273876190186, + "learning_rate": 1.8672033055685766e-05, + "loss": 0.7005, + "step": 18383 + }, + { + "epoch": 0.27041136626466217, + "grad_norm": 0.9793278574943542, + "learning_rate": 1.8590821362404116e-05, + "loss": 0.6894, + "step": 18414 + }, + { + "epoch": 0.2708666042550067, + "grad_norm": 1.0349115133285522, + "learning_rate": 1.8509681995832294e-05, + "loss": 0.6906, + "step": 18445 + }, + { + "epoch": 0.2713218422453513, + "grad_norm": 1.0271046161651611, + "learning_rate": 1.8428615871617004e-05, + "loss": 0.6916, + "step": 18476 + }, + { + "epoch": 0.2717770802356958, + "grad_norm": 0.9766640663146973, + "learning_rate": 1.8347623904578448e-05, + "loss": 0.692, + "step": 18507 + }, + { + "epoch": 0.2722323182260403, + "grad_norm": 1.0498918294906616, + "learning_rate": 1.8266707008699975e-05, + "loss": 0.687, + "step": 18538 + }, + { + "epoch": 0.2726875562163849, + "grad_norm": 0.970116138458252, + "learning_rate": 1.818586609711774e-05, + "loss": 0.6923, + "step": 18569 + }, + { + "epoch": 0.27314279420672943, + "grad_norm": 1.1822494268417358, + "learning_rate": 1.8105102082110462e-05, + "loss": 0.6899, + "step": 18600 + }, + { + "epoch": 0.273598032197074, + "grad_norm": 1.0538249015808105, + "learning_rate": 1.8024415875089058e-05, + "loss": 0.6912, + "step": 18631 + }, + { + "epoch": 0.27405327018741854, + "grad_norm": 1.123678207397461, + "learning_rate": 1.7943808386586407e-05, + "loss": 0.6918, + "step": 18662 + }, + { + "epoch": 0.27450850817776307, + "grad_norm": 1.0302077531814575, + "learning_rate": 1.7863280526247073e-05, + "loss": 0.6872, + "step": 18693 + }, + { + "epoch": 0.27496374616810765, + "grad_norm": 1.0867012739181519, + "learning_rate": 1.7782833202817003e-05, + "loss": 0.7006, + "step": 18724 + }, + { + "epoch": 0.2754189841584522, + "grad_norm": 1.0516695976257324, + "learning_rate": 1.7702467324133327e-05, + "loss": 0.6969, + "step": 18755 + }, + { + "epoch": 0.27587422214879675, + "grad_norm": 1.083567500114441, + "learning_rate": 1.7622183797114042e-05, + "loss": 0.6936, + "step": 18786 + }, + { + "epoch": 0.2763294601391413, + "grad_norm": 1.0399643182754517, + "learning_rate": 1.7541983527747838e-05, + "loss": 0.6887, + "step": 18817 + }, + { + "epoch": 0.27678469812948586, + "grad_norm": 1.1514192819595337, + "learning_rate": 1.746186742108387e-05, + "loss": 0.6882, + "step": 18848 + }, + { + "epoch": 0.2772399361198304, + "grad_norm": 1.1234108209609985, + "learning_rate": 1.73818363812215e-05, + "loss": 0.6909, + "step": 18879 + }, + { + "epoch": 0.2776951741101749, + "grad_norm": 1.0432260036468506, + "learning_rate": 1.7301891311300153e-05, + "loss": 0.6826, + "step": 18910 + }, + { + "epoch": 0.2781504121005195, + "grad_norm": 1.2708081007003784, + "learning_rate": 1.7222033113489055e-05, + "loss": 0.694, + "step": 18941 + }, + { + "epoch": 0.278605650090864, + "grad_norm": 0.9991064667701721, + "learning_rate": 1.7142262688977127e-05, + "loss": 0.7001, + "step": 18972 + }, + { + "epoch": 0.2790608880812086, + "grad_norm": 1.103553295135498, + "learning_rate": 1.7062580937962764e-05, + "loss": 0.6974, + "step": 19003 + }, + { + "epoch": 0.27951612607155313, + "grad_norm": 1.0002790689468384, + "learning_rate": 1.698298875964369e-05, + "loss": 0.6951, + "step": 19034 + }, + { + "epoch": 0.27997136406189765, + "grad_norm": 1.0627328157424927, + "learning_rate": 1.690348705220684e-05, + "loss": 0.6914, + "step": 19065 + }, + { + "epoch": 0.28042660205224224, + "grad_norm": 1.152733325958252, + "learning_rate": 1.6824076712818156e-05, + "loss": 0.6909, + "step": 19096 + }, + { + "epoch": 0.28088184004258676, + "grad_norm": 1.1142559051513672, + "learning_rate": 1.6744758637612533e-05, + "loss": 0.6878, + "step": 19127 + }, + { + "epoch": 0.28133707803293134, + "grad_norm": 1.022026538848877, + "learning_rate": 1.6665533721683664e-05, + "loss": 0.6876, + "step": 19158 + }, + { + "epoch": 0.28179231602327587, + "grad_norm": 1.117065668106079, + "learning_rate": 1.6586402859073974e-05, + "loss": 0.6878, + "step": 19189 + }, + { + "epoch": 0.28224755401362045, + "grad_norm": 0.9499729871749878, + "learning_rate": 1.6507366942764463e-05, + "loss": 0.6888, + "step": 19220 + }, + { + "epoch": 0.282702792003965, + "grad_norm": 1.111111044883728, + "learning_rate": 1.6428426864664732e-05, + "loss": 0.6898, + "step": 19251 + }, + { + "epoch": 0.2831580299943095, + "grad_norm": 1.1620928049087524, + "learning_rate": 1.6349583515602816e-05, + "loss": 0.6948, + "step": 19282 + }, + { + "epoch": 0.2836132679846541, + "grad_norm": 1.1431219577789307, + "learning_rate": 1.6270837785315208e-05, + "loss": 0.6929, + "step": 19313 + }, + { + "epoch": 0.2840685059749986, + "grad_norm": 1.1274683475494385, + "learning_rate": 1.619219056243676e-05, + "loss": 0.6823, + "step": 19344 + }, + { + "epoch": 0.2845237439653432, + "grad_norm": 1.1499154567718506, + "learning_rate": 1.6113642734490698e-05, + "loss": 0.6838, + "step": 19375 + }, + { + "epoch": 0.2849789819556877, + "grad_norm": 1.0493180751800537, + "learning_rate": 1.6035195187878577e-05, + "loss": 0.6867, + "step": 19406 + }, + { + "epoch": 0.2854342199460323, + "grad_norm": 0.9728123545646667, + "learning_rate": 1.5956848807870305e-05, + "loss": 0.6889, + "step": 19437 + }, + { + "epoch": 0.2858894579363768, + "grad_norm": 1.0137308835983276, + "learning_rate": 1.587860447859413e-05, + "loss": 0.6892, + "step": 19468 + }, + { + "epoch": 0.28634469592672135, + "grad_norm": 1.0865050554275513, + "learning_rate": 1.5800463083026686e-05, + "loss": 0.6841, + "step": 19499 + }, + { + "epoch": 0.28679993391706593, + "grad_norm": 1.0522550344467163, + "learning_rate": 1.572242550298298e-05, + "loss": 0.6905, + "step": 19530 + }, + { + "epoch": 0.28725517190741046, + "grad_norm": 1.1563197374343872, + "learning_rate": 1.56444926191065e-05, + "loss": 0.6811, + "step": 19561 + }, + { + "epoch": 0.28771040989775504, + "grad_norm": 0.962688684463501, + "learning_rate": 1.5566665310859257e-05, + "loss": 0.6898, + "step": 19592 + }, + { + "epoch": 0.28816564788809956, + "grad_norm": 1.0998531579971313, + "learning_rate": 1.5488944456511846e-05, + "loss": 0.6909, + "step": 19623 + }, + { + "epoch": 0.2886208858784441, + "grad_norm": 1.1609821319580078, + "learning_rate": 1.5411330933133546e-05, + "loss": 0.6844, + "step": 19654 + }, + { + "epoch": 0.28907612386878867, + "grad_norm": 0.9745819568634033, + "learning_rate": 1.533382561658241e-05, + "loss": 0.6933, + "step": 19685 + }, + { + "epoch": 0.2895313618591332, + "grad_norm": 1.085925817489624, + "learning_rate": 1.525642938149541e-05, + "loss": 0.6894, + "step": 19716 + }, + { + "epoch": 0.2899865998494778, + "grad_norm": 1.0314606428146362, + "learning_rate": 1.5179143101278536e-05, + "loss": 0.6965, + "step": 19747 + }, + { + "epoch": 0.2904418378398223, + "grad_norm": 1.0771900415420532, + "learning_rate": 1.5101967648096955e-05, + "loss": 0.6904, + "step": 19778 + }, + { + "epoch": 0.2908970758301669, + "grad_norm": 0.9729062914848328, + "learning_rate": 1.5024903892865172e-05, + "loss": 0.6886, + "step": 19809 + }, + { + "epoch": 0.2913523138205114, + "grad_norm": 1.0824676752090454, + "learning_rate": 1.4947952705237184e-05, + "loss": 0.6804, + "step": 19840 + }, + { + "epoch": 0.29180755181085594, + "grad_norm": 1.0260144472122192, + "learning_rate": 1.4871114953596682e-05, + "loss": 0.6905, + "step": 19871 + }, + { + "epoch": 0.2922627898012005, + "grad_norm": 0.9324101209640503, + "learning_rate": 1.4794391505047256e-05, + "loss": 0.6877, + "step": 19902 + }, + { + "epoch": 0.29271802779154504, + "grad_norm": 1.0553687810897827, + "learning_rate": 1.4717783225402596e-05, + "loss": 0.6928, + "step": 19933 + }, + { + "epoch": 0.2931732657818896, + "grad_norm": 1.129400610923767, + "learning_rate": 1.4641290979176735e-05, + "loss": 0.6905, + "step": 19964 + }, + { + "epoch": 0.29362850377223415, + "grad_norm": 1.064041018486023, + "learning_rate": 1.4564915629574246e-05, + "loss": 0.6936, + "step": 19995 + }, + { + "epoch": 0.2940837417625787, + "grad_norm": 1.116929292678833, + "learning_rate": 1.4488658038480601e-05, + "loss": 0.6818, + "step": 20026 + }, + { + "epoch": 0.29453897975292326, + "grad_norm": 1.0334928035736084, + "learning_rate": 1.4412519066452323e-05, + "loss": 0.6887, + "step": 20057 + }, + { + "epoch": 0.2949942177432678, + "grad_norm": 1.0690734386444092, + "learning_rate": 1.4336499572707373e-05, + "loss": 0.6885, + "step": 20088 + }, + { + "epoch": 0.29544945573361237, + "grad_norm": 1.1211203336715698, + "learning_rate": 1.4260600415115433e-05, + "loss": 0.6919, + "step": 20119 + }, + { + "epoch": 0.2959046937239569, + "grad_norm": 0.9984875917434692, + "learning_rate": 1.4184822450188137e-05, + "loss": 0.6892, + "step": 20150 + }, + { + "epoch": 0.29635993171430147, + "grad_norm": 1.0159475803375244, + "learning_rate": 1.410916653306954e-05, + "loss": 0.682, + "step": 20181 + }, + { + "epoch": 0.296815169704646, + "grad_norm": 0.9778633117675781, + "learning_rate": 1.403363351752639e-05, + "loss": 0.6808, + "step": 20212 + }, + { + "epoch": 0.2972704076949905, + "grad_norm": 1.1207058429718018, + "learning_rate": 1.3958224255938485e-05, + "loss": 0.6852, + "step": 20243 + }, + { + "epoch": 0.2977256456853351, + "grad_norm": 1.0286227464675903, + "learning_rate": 1.388293959928911e-05, + "loss": 0.6886, + "step": 20274 + }, + { + "epoch": 0.29818088367567963, + "grad_norm": 1.0112954378128052, + "learning_rate": 1.3807780397155379e-05, + "loss": 0.6878, + "step": 20305 + }, + { + "epoch": 0.2986361216660242, + "grad_norm": 1.0683724880218506, + "learning_rate": 1.3732747497698655e-05, + "loss": 0.6889, + "step": 20336 + }, + { + "epoch": 0.29909135965636874, + "grad_norm": 1.0744072198867798, + "learning_rate": 1.3657841747655038e-05, + "loss": 0.6791, + "step": 20367 + }, + { + "epoch": 0.2995465976467133, + "grad_norm": 1.0279752016067505, + "learning_rate": 1.3583063992325706e-05, + "loss": 0.684, + "step": 20398 + }, + { + "epoch": 0.30000183563705785, + "grad_norm": 0.9995334148406982, + "learning_rate": 1.3508415075567496e-05, + "loss": 0.6906, + "step": 20429 + }, + { + "epoch": 0.30045707362740237, + "grad_norm": 1.351607322692871, + "learning_rate": 1.343389583978327e-05, + "loss": 0.6964, + "step": 20460 + }, + { + "epoch": 0.30091231161774695, + "grad_norm": 1.0838359594345093, + "learning_rate": 1.3359507125912468e-05, + "loss": 0.6784, + "step": 20491 + }, + { + "epoch": 0.3013675496080915, + "grad_norm": 1.0536307096481323, + "learning_rate": 1.3285249773421627e-05, + "loss": 0.6872, + "step": 20522 + }, + { + "epoch": 0.30182278759843606, + "grad_norm": 0.9636529088020325, + "learning_rate": 1.3211124620294884e-05, + "loss": 0.6914, + "step": 20553 + }, + { + "epoch": 0.3022780255887806, + "grad_norm": 1.1852017641067505, + "learning_rate": 1.313713250302451e-05, + "loss": 0.6821, + "step": 20584 + }, + { + "epoch": 0.3027332635791251, + "grad_norm": 1.072434425354004, + "learning_rate": 1.3063274256601479e-05, + "loss": 0.695, + "step": 20615 + }, + { + "epoch": 0.3031885015694697, + "grad_norm": 1.2345269918441772, + "learning_rate": 1.2989550714506086e-05, + "loss": 0.6824, + "step": 20646 + }, + { + "epoch": 0.3036437395598142, + "grad_norm": 1.0516636371612549, + "learning_rate": 1.291596270869846e-05, + "loss": 0.6854, + "step": 20677 + }, + { + "epoch": 0.3040989775501588, + "grad_norm": 1.0413544178009033, + "learning_rate": 1.284251106960927e-05, + "loss": 0.6895, + "step": 20708 + }, + { + "epoch": 0.3045542155405033, + "grad_norm": 1.158065676689148, + "learning_rate": 1.2769196626130263e-05, + "loss": 0.6861, + "step": 20739 + }, + { + "epoch": 0.3050094535308479, + "grad_norm": 1.0109269618988037, + "learning_rate": 1.2696020205604969e-05, + "loss": 0.6898, + "step": 20770 + }, + { + "epoch": 0.30546469152119243, + "grad_norm": 0.9886858463287354, + "learning_rate": 1.2622982633819359e-05, + "loss": 0.6874, + "step": 20801 + }, + { + "epoch": 0.30591992951153696, + "grad_norm": 1.0234347581863403, + "learning_rate": 1.2550084734992484e-05, + "loss": 0.6823, + "step": 20832 + }, + { + "epoch": 0.30637516750188154, + "grad_norm": 1.028950810432434, + "learning_rate": 1.247732733176724e-05, + "loss": 0.6884, + "step": 20863 + }, + { + "epoch": 0.30683040549222607, + "grad_norm": 1.1941654682159424, + "learning_rate": 1.2404711245201044e-05, + "loss": 0.7019, + "step": 20894 + }, + { + "epoch": 0.30728564348257065, + "grad_norm": 1.0201176404953003, + "learning_rate": 1.2332237294756535e-05, + "loss": 0.6846, + "step": 20925 + }, + { + "epoch": 0.3077408814729152, + "grad_norm": 0.9765841364860535, + "learning_rate": 1.225990629829241e-05, + "loss": 0.6881, + "step": 20956 + }, + { + "epoch": 0.3081961194632597, + "grad_norm": 1.0036793947219849, + "learning_rate": 1.2187719072054136e-05, + "loss": 0.6849, + "step": 20987 + }, + { + "epoch": 0.3086513574536043, + "grad_norm": 1.1151163578033447, + "learning_rate": 1.2115676430664735e-05, + "loss": 0.6825, + "step": 21018 + }, + { + "epoch": 0.3091065954439488, + "grad_norm": 1.0734307765960693, + "learning_rate": 1.2043779187115647e-05, + "loss": 0.6902, + "step": 21049 + }, + { + "epoch": 0.3095618334342934, + "grad_norm": 0.9811964631080627, + "learning_rate": 1.1972028152757476e-05, + "loss": 0.6883, + "step": 21080 + }, + { + "epoch": 0.3100170714246379, + "grad_norm": 1.0949833393096924, + "learning_rate": 1.1900424137290889e-05, + "loss": 0.6873, + "step": 21111 + }, + { + "epoch": 0.3104723094149825, + "grad_norm": 1.0459587574005127, + "learning_rate": 1.1828967948757482e-05, + "loss": 0.6853, + "step": 21142 + }, + { + "epoch": 0.310927547405327, + "grad_norm": 1.1628592014312744, + "learning_rate": 1.175766039353062e-05, + "loss": 0.6837, + "step": 21173 + }, + { + "epoch": 0.31138278539567155, + "grad_norm": 0.9916526079177856, + "learning_rate": 1.1686502276306382e-05, + "loss": 0.6838, + "step": 21204 + }, + { + "epoch": 0.3118380233860161, + "grad_norm": 0.9945309162139893, + "learning_rate": 1.1615494400094445e-05, + "loss": 0.6811, + "step": 21235 + }, + { + "epoch": 0.31229326137636065, + "grad_norm": 1.0234261751174927, + "learning_rate": 1.1544637566209029e-05, + "loss": 0.6833, + "step": 21266 + }, + { + "epoch": 0.31274849936670523, + "grad_norm": 0.999071478843689, + "learning_rate": 1.1473932574259886e-05, + "loss": 0.6858, + "step": 21297 + }, + { + "epoch": 0.31320373735704976, + "grad_norm": 1.0478752851486206, + "learning_rate": 1.1403380222143247e-05, + "loss": 0.6918, + "step": 21328 + }, + { + "epoch": 0.3136589753473943, + "grad_norm": 1.083009958267212, + "learning_rate": 1.1332981306032808e-05, + "loss": 0.6758, + "step": 21359 + }, + { + "epoch": 0.31411421333773887, + "grad_norm": 0.9705089926719666, + "learning_rate": 1.1262736620370762e-05, + "loss": 0.6784, + "step": 21390 + }, + { + "epoch": 0.3145694513280834, + "grad_norm": 0.9727345108985901, + "learning_rate": 1.1192646957858854e-05, + "loss": 0.6902, + "step": 21421 + }, + { + "epoch": 0.315024689318428, + "grad_norm": 1.1719439029693604, + "learning_rate": 1.1122713109449381e-05, + "loss": 0.6802, + "step": 21452 + }, + { + "epoch": 0.3154799273087725, + "grad_norm": 1.061924695968628, + "learning_rate": 1.105293586433634e-05, + "loss": 0.6829, + "step": 21483 + }, + { + "epoch": 0.3159351652991171, + "grad_norm": 0.965242326259613, + "learning_rate": 1.0983316009946446e-05, + "loss": 0.6795, + "step": 21514 + }, + { + "epoch": 0.3163904032894616, + "grad_norm": 1.0916402339935303, + "learning_rate": 1.0913854331930282e-05, + "loss": 0.689, + "step": 21545 + }, + { + "epoch": 0.31684564127980613, + "grad_norm": 1.088815450668335, + "learning_rate": 1.0844551614153456e-05, + "loss": 0.6845, + "step": 21576 + }, + { + "epoch": 0.3173008792701507, + "grad_norm": 1.052106499671936, + "learning_rate": 1.0775408638687725e-05, + "loss": 0.68, + "step": 21607 + }, + { + "epoch": 0.31775611726049524, + "grad_norm": 0.9820737242698669, + "learning_rate": 1.0706426185802165e-05, + "loss": 0.6872, + "step": 21638 + }, + { + "epoch": 0.3182113552508398, + "grad_norm": 1.014233946800232, + "learning_rate": 1.0637605033954371e-05, + "loss": 0.6891, + "step": 21669 + }, + { + "epoch": 0.31866659324118435, + "grad_norm": 1.098426103591919, + "learning_rate": 1.05689459597817e-05, + "loss": 0.6878, + "step": 21700 + }, + { + "epoch": 0.31912183123152893, + "grad_norm": 1.0153820514678955, + "learning_rate": 1.050044973809246e-05, + "loss": 0.6793, + "step": 21731 + }, + { + "epoch": 0.31957706922187346, + "grad_norm": 1.07473886013031, + "learning_rate": 1.043211714185722e-05, + "loss": 0.6892, + "step": 21762 + }, + { + "epoch": 0.320032307212218, + "grad_norm": 1.108799934387207, + "learning_rate": 1.036394894220003e-05, + "loss": 0.6819, + "step": 21793 + }, + { + "epoch": 0.32048754520256256, + "grad_norm": 1.105481743812561, + "learning_rate": 1.0295945908389751e-05, + "loss": 0.6871, + "step": 21824 + }, + { + "epoch": 0.3209427831929071, + "grad_norm": 1.106384038925171, + "learning_rate": 1.0228108807831393e-05, + "loss": 0.6789, + "step": 21855 + }, + { + "epoch": 0.32139802118325167, + "grad_norm": 1.0353236198425293, + "learning_rate": 1.01604384060574e-05, + "loss": 0.6822, + "step": 21886 + }, + { + "epoch": 0.3218532591735962, + "grad_norm": 1.086665153503418, + "learning_rate": 1.009293546671907e-05, + "loss": 0.6804, + "step": 21917 + }, + { + "epoch": 0.3223084971639407, + "grad_norm": 0.982736349105835, + "learning_rate": 1.002560075157791e-05, + "loss": 0.6838, + "step": 21948 + }, + { + "epoch": 0.3227637351542853, + "grad_norm": 1.1834845542907715, + "learning_rate": 9.958435020496995e-06, + "loss": 0.6881, + "step": 21979 + }, + { + "epoch": 0.32321897314462983, + "grad_norm": 1.0267136096954346, + "learning_rate": 9.89143903143249e-06, + "loss": 0.6814, + "step": 22010 + }, + { + "epoch": 0.3236742111349744, + "grad_norm": 1.0347238779067993, + "learning_rate": 9.824613540425038e-06, + "loss": 0.6806, + "step": 22041 + }, + { + "epoch": 0.32412944912531894, + "grad_norm": 1.0527032613754272, + "learning_rate": 9.757959301591197e-06, + "loss": 0.6791, + "step": 22072 + }, + { + "epoch": 0.3245846871156635, + "grad_norm": 1.0493974685668945, + "learning_rate": 9.691477067115017e-06, + "loss": 0.69, + "step": 22103 + }, + { + "epoch": 0.32503992510600804, + "grad_norm": 1.0073857307434082, + "learning_rate": 9.625167587239467e-06, + "loss": 0.6781, + "step": 22134 + }, + { + "epoch": 0.32549516309635257, + "grad_norm": 0.9913318157196045, + "learning_rate": 9.559031610258007e-06, + "loss": 0.6825, + "step": 22165 + }, + { + "epoch": 0.32595040108669715, + "grad_norm": 1.0183086395263672, + "learning_rate": 9.493069882506164e-06, + "loss": 0.6805, + "step": 22196 + }, + { + "epoch": 0.3264056390770417, + "grad_norm": 1.0087958574295044, + "learning_rate": 9.427283148353056e-06, + "loss": 0.6775, + "step": 22227 + }, + { + "epoch": 0.32686087706738626, + "grad_norm": 1.1444545984268188, + "learning_rate": 9.361672150193052e-06, + "loss": 0.684, + "step": 22258 + }, + { + "epoch": 0.3273161150577308, + "grad_norm": 1.0593520402908325, + "learning_rate": 9.29623762843734e-06, + "loss": 0.6874, + "step": 22289 + }, + { + "epoch": 0.3277713530480753, + "grad_norm": 1.0576874017715454, + "learning_rate": 9.230980321505594e-06, + "loss": 0.6789, + "step": 22320 + }, + { + "epoch": 0.3282265910384199, + "grad_norm": 0.9419311285018921, + "learning_rate": 9.165900965817668e-06, + "loss": 0.6727, + "step": 22351 + }, + { + "epoch": 0.3286818290287644, + "grad_norm": 1.116232991218567, + "learning_rate": 9.101000295785245e-06, + "loss": 0.6868, + "step": 22382 + }, + { + "epoch": 0.329137067019109, + "grad_norm": 1.1253653764724731, + "learning_rate": 9.036279043803565e-06, + "loss": 0.6782, + "step": 22413 + }, + { + "epoch": 0.3295923050094535, + "grad_norm": 1.1574054956436157, + "learning_rate": 8.971737940243147e-06, + "loss": 0.6801, + "step": 22444 + }, + { + "epoch": 0.3300475429997981, + "grad_norm": 0.9954296946525574, + "learning_rate": 8.907377713441592e-06, + "loss": 0.6815, + "step": 22475 + }, + { + "epoch": 0.33050278099014263, + "grad_norm": 1.0231496095657349, + "learning_rate": 8.843199089695293e-06, + "loss": 0.6828, + "step": 22506 + }, + { + "epoch": 0.33095801898048716, + "grad_norm": 1.0164254903793335, + "learning_rate": 8.779202793251311e-06, + "loss": 0.6806, + "step": 22537 + }, + { + "epoch": 0.33141325697083174, + "grad_norm": 0.9951460361480713, + "learning_rate": 8.715389546299149e-06, + "loss": 0.6838, + "step": 22568 + }, + { + "epoch": 0.33186849496117626, + "grad_norm": 1.117965817451477, + "learning_rate": 8.651760068962617e-06, + "loss": 0.6726, + "step": 22599 + }, + { + "epoch": 0.33232373295152084, + "grad_norm": 0.9313582181930542, + "learning_rate": 8.588315079291733e-06, + "loss": 0.6807, + "step": 22630 + }, + { + "epoch": 0.33277897094186537, + "grad_norm": 1.0607514381408691, + "learning_rate": 8.52505529325457e-06, + "loss": 0.6757, + "step": 22661 + }, + { + "epoch": 0.3332342089322099, + "grad_norm": 0.9300371408462524, + "learning_rate": 8.461981424729216e-06, + "loss": 0.6715, + "step": 22692 + }, + { + "epoch": 0.3336894469225545, + "grad_norm": 1.0345282554626465, + "learning_rate": 8.399094185495725e-06, + "loss": 0.6814, + "step": 22723 + }, + { + "epoch": 0.334144684912899, + "grad_norm": 1.008501648902893, + "learning_rate": 8.336394285228017e-06, + "loss": 0.6874, + "step": 22754 + }, + { + "epoch": 0.3345999229032436, + "grad_norm": 1.2125533819198608, + "learning_rate": 8.273882431485952e-06, + "loss": 0.6822, + "step": 22785 + }, + { + "epoch": 0.3350551608935881, + "grad_norm": 1.0801039934158325, + "learning_rate": 8.211559329707316e-06, + "loss": 0.6934, + "step": 22816 + }, + { + "epoch": 0.3355103988839327, + "grad_norm": 1.054120659828186, + "learning_rate": 8.149425683199823e-06, + "loss": 0.673, + "step": 22847 + }, + { + "epoch": 0.3359656368742772, + "grad_norm": 1.0891995429992676, + "learning_rate": 8.08748219313325e-06, + "loss": 0.6852, + "step": 22878 + }, + { + "epoch": 0.33642087486462174, + "grad_norm": 1.0153359174728394, + "learning_rate": 8.025729558531453e-06, + "loss": 0.6838, + "step": 22909 + }, + { + "epoch": 0.3368761128549663, + "grad_norm": 1.0803651809692383, + "learning_rate": 7.964168476264508e-06, + "loss": 0.6832, + "step": 22940 + }, + { + "epoch": 0.33733135084531085, + "grad_norm": 1.0524797439575195, + "learning_rate": 7.902799641040884e-06, + "loss": 0.6818, + "step": 22971 + }, + { + "epoch": 0.33778658883565543, + "grad_norm": 1.1119606494903564, + "learning_rate": 7.841623745399523e-06, + "loss": 0.6853, + "step": 23002 + }, + { + "epoch": 0.33824182682599996, + "grad_norm": 0.9859051704406738, + "learning_rate": 7.780641479702114e-06, + "loss": 0.6865, + "step": 23033 + }, + { + "epoch": 0.33869706481634454, + "grad_norm": 0.9103122353553772, + "learning_rate": 7.719853532125227e-06, + "loss": 0.6766, + "step": 23064 + }, + { + "epoch": 0.33915230280668907, + "grad_norm": 1.023934245109558, + "learning_rate": 7.65926058865258e-06, + "loss": 0.6781, + "step": 23095 + }, + { + "epoch": 0.3396075407970336, + "grad_norm": 1.0698734521865845, + "learning_rate": 7.598863333067313e-06, + "loss": 0.6848, + "step": 23126 + }, + { + "epoch": 0.3400627787873782, + "grad_norm": 1.1564706563949585, + "learning_rate": 7.538662446944253e-06, + "loss": 0.6868, + "step": 23157 + }, + { + "epoch": 0.3405180167777227, + "grad_norm": 1.114890456199646, + "learning_rate": 7.478658609642211e-06, + "loss": 0.6785, + "step": 23188 + }, + { + "epoch": 0.3409732547680673, + "grad_norm": 1.1149795055389404, + "learning_rate": 7.418852498296327e-06, + "loss": 0.6706, + "step": 23219 + }, + { + "epoch": 0.3414284927584118, + "grad_norm": 1.0689568519592285, + "learning_rate": 7.359244787810457e-06, + "loss": 0.6751, + "step": 23250 + }, + { + "epoch": 0.34188373074875633, + "grad_norm": 1.1450313329696655, + "learning_rate": 7.299836150849493e-06, + "loss": 0.6708, + "step": 23281 + }, + { + "epoch": 0.3423389687391009, + "grad_norm": 1.1399791240692139, + "learning_rate": 7.240627257831847e-06, + "loss": 0.6781, + "step": 23312 + }, + { + "epoch": 0.34279420672944544, + "grad_norm": 1.131117343902588, + "learning_rate": 7.1816187769218195e-06, + "loss": 0.6767, + "step": 23343 + }, + { + "epoch": 0.34324944471979, + "grad_norm": 1.0069117546081543, + "learning_rate": 7.1228113740220895e-06, + "loss": 0.6795, + "step": 23374 + }, + { + "epoch": 0.34370468271013455, + "grad_norm": 1.0576585531234741, + "learning_rate": 7.064205712766226e-06, + "loss": 0.685, + "step": 23405 + }, + { + "epoch": 0.3441599207004791, + "grad_norm": 1.1254713535308838, + "learning_rate": 7.005802454511129e-06, + "loss": 0.6772, + "step": 23436 + }, + { + "epoch": 0.34461515869082365, + "grad_norm": 1.032549262046814, + "learning_rate": 6.947602258329639e-06, + "loss": 0.6777, + "step": 23467 + }, + { + "epoch": 0.3450703966811682, + "grad_norm": 1.019902229309082, + "learning_rate": 6.889605781003078e-06, + "loss": 0.6814, + "step": 23498 + }, + { + "epoch": 0.34552563467151276, + "grad_norm": 1.0798234939575195, + "learning_rate": 6.831813677013776e-06, + "loss": 0.671, + "step": 23529 + }, + { + "epoch": 0.3459808726618573, + "grad_norm": 1.0329113006591797, + "learning_rate": 6.774226598537792e-06, + "loss": 0.6767, + "step": 23560 + }, + { + "epoch": 0.34643611065220187, + "grad_norm": 1.0345433950424194, + "learning_rate": 6.716845195437482e-06, + "loss": 0.6739, + "step": 23591 + }, + { + "epoch": 0.3468913486425464, + "grad_norm": 0.9708922505378723, + "learning_rate": 6.659670115254168e-06, + "loss": 0.6793, + "step": 23622 + }, + { + "epoch": 0.3473465866328909, + "grad_norm": 1.035715103149414, + "learning_rate": 6.602702003200872e-06, + "loss": 0.6847, + "step": 23653 + }, + { + "epoch": 0.3478018246232355, + "grad_norm": 0.9764015078544617, + "learning_rate": 6.545941502154992e-06, + "loss": 0.6815, + "step": 23684 + }, + { + "epoch": 0.34825706261358, + "grad_norm": 1.033187747001648, + "learning_rate": 6.489389252651057e-06, + "loss": 0.6715, + "step": 23715 + }, + { + "epoch": 0.3487123006039246, + "grad_norm": 1.0181931257247925, + "learning_rate": 6.4330458928735325e-06, + "loss": 0.6758, + "step": 23746 + }, + { + "epoch": 0.34916753859426913, + "grad_norm": 1.0105568170547485, + "learning_rate": 6.376912058649559e-06, + "loss": 0.6746, + "step": 23777 + }, + { + "epoch": 0.3496227765846137, + "grad_norm": 1.0559037923812866, + "learning_rate": 6.320988383441845e-06, + "loss": 0.6795, + "step": 23808 + }, + { + "epoch": 0.35007801457495824, + "grad_norm": 1.0579489469528198, + "learning_rate": 6.265275498341452e-06, + "loss": 0.6812, + "step": 23839 + }, + { + "epoch": 0.35053325256530277, + "grad_norm": 1.0675939321517944, + "learning_rate": 6.209774032060714e-06, + "loss": 0.6769, + "step": 23870 + }, + { + "epoch": 0.35098849055564735, + "grad_norm": 0.8914999961853027, + "learning_rate": 6.1544846109261365e-06, + "loss": 0.6825, + "step": 23901 + }, + { + "epoch": 0.3514437285459919, + "grad_norm": 0.9489038586616516, + "learning_rate": 6.099407858871342e-06, + "loss": 0.6801, + "step": 23932 + }, + { + "epoch": 0.35189896653633645, + "grad_norm": 1.040340781211853, + "learning_rate": 6.044544397429958e-06, + "loss": 0.6813, + "step": 23963 + }, + { + "epoch": 0.352354204526681, + "grad_norm": 1.2235099077224731, + "learning_rate": 5.989894845728708e-06, + "loss": 0.6742, + "step": 23994 + }, + { + "epoch": 0.3528094425170255, + "grad_norm": 1.0165103673934937, + "learning_rate": 5.9354598204803605e-06, + "loss": 0.6768, + "step": 24025 + }, + { + "epoch": 0.3532646805073701, + "grad_norm": 1.023253321647644, + "learning_rate": 5.881239935976762e-06, + "loss": 0.678, + "step": 24056 + }, + { + "epoch": 0.3537199184977146, + "grad_norm": 1.0616728067398071, + "learning_rate": 5.827235804081954e-06, + "loss": 0.6723, + "step": 24087 + }, + { + "epoch": 0.3541751564880592, + "grad_norm": 1.021304726600647, + "learning_rate": 5.773448034225221e-06, + "loss": 0.677, + "step": 24118 + }, + { + "epoch": 0.3546303944784037, + "grad_norm": 1.055176854133606, + "learning_rate": 5.719877233394228e-06, + "loss": 0.6779, + "step": 24149 + }, + { + "epoch": 0.3550856324687483, + "grad_norm": 1.0145021677017212, + "learning_rate": 5.666524006128191e-06, + "loss": 0.679, + "step": 24180 + }, + { + "epoch": 0.35554087045909283, + "grad_norm": 1.0869520902633667, + "learning_rate": 5.613388954511015e-06, + "loss": 0.6728, + "step": 24211 + }, + { + "epoch": 0.35599610844943735, + "grad_norm": 0.9280300140380859, + "learning_rate": 5.560472678164552e-06, + "loss": 0.6794, + "step": 24242 + }, + { + "epoch": 0.35645134643978194, + "grad_norm": 0.9949683547019958, + "learning_rate": 5.507775774241775e-06, + "loss": 0.6771, + "step": 24273 + }, + { + "epoch": 0.35690658443012646, + "grad_norm": 1.0003410577774048, + "learning_rate": 5.4552988374200945e-06, + "loss": 0.6789, + "step": 24304 + }, + { + "epoch": 0.35736182242047104, + "grad_norm": 1.0053478479385376, + "learning_rate": 5.403042459894597e-06, + "loss": 0.6783, + "step": 24335 + }, + { + "epoch": 0.35781706041081557, + "grad_norm": 0.992439866065979, + "learning_rate": 5.3510072313714135e-06, + "loss": 0.6734, + "step": 24366 + }, + { + "epoch": 0.35827229840116015, + "grad_norm": 1.0443801879882812, + "learning_rate": 5.2991937390610205e-06, + "loss": 0.6833, + "step": 24397 + }, + { + "epoch": 0.3587275363915047, + "grad_norm": 0.9766219854354858, + "learning_rate": 5.247602567671625e-06, + "loss": 0.6731, + "step": 24428 + }, + { + "epoch": 0.3591827743818492, + "grad_norm": 1.0202499628067017, + "learning_rate": 5.196234299402603e-06, + "loss": 0.6801, + "step": 24459 + }, + { + "epoch": 0.3596380123721938, + "grad_norm": 1.0573033094406128, + "learning_rate": 5.145089513937865e-06, + "loss": 0.6759, + "step": 24490 + }, + { + "epoch": 0.3600932503625383, + "grad_norm": 1.0492045879364014, + "learning_rate": 5.094168788439369e-06, + "loss": 0.6805, + "step": 24521 + }, + { + "epoch": 0.3605484883528829, + "grad_norm": 0.9806486964225769, + "learning_rate": 5.043472697540594e-06, + "loss": 0.6809, + "step": 24552 + }, + { + "epoch": 0.3610037263432274, + "grad_norm": 1.0643500089645386, + "learning_rate": 4.993001813340012e-06, + "loss": 0.6749, + "step": 24583 + }, + { + "epoch": 0.36145896433357194, + "grad_norm": 1.0298391580581665, + "learning_rate": 4.942756705394702e-06, + "loss": 0.6692, + "step": 24614 + }, + { + "epoch": 0.3619142023239165, + "grad_norm": 1.0897893905639648, + "learning_rate": 4.892737940713884e-06, + "loss": 0.6843, + "step": 24645 + }, + { + "epoch": 0.36236944031426105, + "grad_norm": 1.016927719116211, + "learning_rate": 4.842946083752511e-06, + "loss": 0.6789, + "step": 24676 + }, + { + "epoch": 0.36282467830460563, + "grad_norm": 1.0338464975357056, + "learning_rate": 4.79338169640493e-06, + "loss": 0.6693, + "step": 24707 + }, + { + "epoch": 0.36327991629495016, + "grad_norm": 1.0217540264129639, + "learning_rate": 4.74404533799851e-06, + "loss": 0.6734, + "step": 24738 + }, + { + "epoch": 0.36373515428529474, + "grad_norm": 0.973475992679596, + "learning_rate": 4.694937565287344e-06, + "loss": 0.6716, + "step": 24769 + }, + { + "epoch": 0.36419039227563926, + "grad_norm": 1.0647684335708618, + "learning_rate": 4.646058932445985e-06, + "loss": 0.6731, + "step": 24800 + }, + { + "epoch": 0.3646456302659838, + "grad_norm": 1.0267359018325806, + "learning_rate": 4.597409991063148e-06, + "loss": 0.673, + "step": 24831 + }, + { + "epoch": 0.36510086825632837, + "grad_norm": 1.150754451751709, + "learning_rate": 4.5489912901355375e-06, + "loss": 0.6818, + "step": 24862 + }, + { + "epoch": 0.3655561062466729, + "grad_norm": 1.061081886291504, + "learning_rate": 4.500803376061608e-06, + "loss": 0.6804, + "step": 24893 + }, + { + "epoch": 0.3660113442370175, + "grad_norm": 1.08760404586792, + "learning_rate": 4.45284679263541e-06, + "loss": 0.667, + "step": 24924 + }, + { + "epoch": 0.366466582227362, + "grad_norm": 1.0349477529525757, + "learning_rate": 4.4051220810404775e-06, + "loss": 0.6712, + "step": 24955 + }, + { + "epoch": 0.36692182021770653, + "grad_norm": 1.015647053718567, + "learning_rate": 4.3576297798437025e-06, + "loss": 0.6736, + "step": 24986 + }, + { + "epoch": 0.3673770582080511, + "grad_norm": 1.0646811723709106, + "learning_rate": 4.3103704249892436e-06, + "loss": 0.6758, + "step": 25017 + }, + { + "epoch": 0.36783229619839564, + "grad_norm": 1.0361759662628174, + "learning_rate": 4.263344549792487e-06, + "loss": 0.6768, + "step": 25048 + }, + { + "epoch": 0.3682875341887402, + "grad_norm": 0.9110644459724426, + "learning_rate": 4.216552684934056e-06, + "loss": 0.6826, + "step": 25079 + }, + { + "epoch": 0.36874277217908474, + "grad_norm": 0.9504536390304565, + "learning_rate": 4.169995358453777e-06, + "loss": 0.6848, + "step": 25110 + }, + { + "epoch": 0.3691980101694293, + "grad_norm": 1.049663782119751, + "learning_rate": 4.123673095744757e-06, + "loss": 0.6786, + "step": 25141 + }, + { + "epoch": 0.36965324815977385, + "grad_norm": 1.0623620748519897, + "learning_rate": 4.077586419547435e-06, + "loss": 0.6834, + "step": 25172 + }, + { + "epoch": 0.3701084861501184, + "grad_norm": 1.0130892992019653, + "learning_rate": 4.03173584994368e-06, + "loss": 0.6847, + "step": 25203 + }, + { + "epoch": 0.37056372414046296, + "grad_norm": 1.0924913883209229, + "learning_rate": 3.986121904350948e-06, + "loss": 0.6789, + "step": 25234 + }, + { + "epoch": 0.3710189621308075, + "grad_norm": 1.0095348358154297, + "learning_rate": 3.940745097516407e-06, + "loss": 0.6772, + "step": 25265 + }, + { + "epoch": 0.37147420012115207, + "grad_norm": 1.0151805877685547, + "learning_rate": 3.89560594151116e-06, + "loss": 0.6835, + "step": 25296 + }, + { + "epoch": 0.3719294381114966, + "grad_norm": 1.031278133392334, + "learning_rate": 3.850704945724456e-06, + "loss": 0.6775, + "step": 25327 + }, + { + "epoch": 0.3723846761018411, + "grad_norm": 0.9886388778686523, + "learning_rate": 3.8060426168579077e-06, + "loss": 0.6833, + "step": 25358 + }, + { + "epoch": 0.3728399140921857, + "grad_norm": 1.0071016550064087, + "learning_rate": 3.7616194589198407e-06, + "loss": 0.6763, + "step": 25389 + }, + { + "epoch": 0.3732951520825302, + "grad_norm": 0.986391007900238, + "learning_rate": 3.7174359732195574e-06, + "loss": 0.6753, + "step": 25420 + }, + { + "epoch": 0.3737503900728748, + "grad_norm": 1.0014315843582153, + "learning_rate": 3.673492658361677e-06, + "loss": 0.6811, + "step": 25451 + }, + { + "epoch": 0.37420562806321933, + "grad_norm": 1.0151642560958862, + "learning_rate": 3.6297900102405467e-06, + "loss": 0.6699, + "step": 25482 + }, + { + "epoch": 0.3746608660535639, + "grad_norm": 1.1334686279296875, + "learning_rate": 3.586328522034607e-06, + "loss": 0.6692, + "step": 25513 + }, + { + "epoch": 0.37511610404390844, + "grad_norm": 1.0605874061584473, + "learning_rate": 3.543108684200838e-06, + "loss": 0.6792, + "step": 25544 + }, + { + "epoch": 0.37557134203425296, + "grad_norm": 0.9361720681190491, + "learning_rate": 3.5001309844692464e-06, + "loss": 0.6826, + "step": 25575 + }, + { + "epoch": 0.37602658002459755, + "grad_norm": 1.131210446357727, + "learning_rate": 3.4573959078373215e-06, + "loss": 0.6725, + "step": 25606 + }, + { + "epoch": 0.37648181801494207, + "grad_norm": 1.065705418586731, + "learning_rate": 3.4149039365646063e-06, + "loss": 0.671, + "step": 25637 + }, + { + "epoch": 0.37693705600528665, + "grad_norm": 1.0600756406784058, + "learning_rate": 3.3726555501672143e-06, + "loss": 0.682, + "step": 25668 + }, + { + "epoch": 0.3773922939956312, + "grad_norm": 0.9861871004104614, + "learning_rate": 3.33065122541244e-06, + "loss": 0.6719, + "step": 25699 + }, + { + "epoch": 0.37784753198597576, + "grad_norm": 0.980603039264679, + "learning_rate": 3.288891436313385e-06, + "loss": 0.6784, + "step": 25730 + }, + { + "epoch": 0.3783027699763203, + "grad_norm": 1.0262209177017212, + "learning_rate": 3.2473766541235963e-06, + "loss": 0.6745, + "step": 25761 + }, + { + "epoch": 0.3787580079666648, + "grad_norm": 1.1122559309005737, + "learning_rate": 3.2061073473317466e-06, + "loss": 0.6851, + "step": 25792 + }, + { + "epoch": 0.3792132459570094, + "grad_norm": 0.986608624458313, + "learning_rate": 3.1650839816563444e-06, + "loss": 0.6794, + "step": 25823 + }, + { + "epoch": 0.3796684839473539, + "grad_norm": 1.1190105676651, + "learning_rate": 3.1243070200405093e-06, + "loss": 0.6706, + "step": 25854 + }, + { + "epoch": 0.3801237219376985, + "grad_norm": 0.9959911108016968, + "learning_rate": 3.0837769226467e-06, + "loss": 0.6809, + "step": 25885 + }, + { + "epoch": 0.380578959928043, + "grad_norm": 1.090646505355835, + "learning_rate": 3.0434941468515666e-06, + "loss": 0.677, + "step": 25916 + }, + { + "epoch": 0.38103419791838755, + "grad_norm": 1.1067842245101929, + "learning_rate": 3.003459147240753e-06, + "loss": 0.6751, + "step": 25947 + }, + { + "epoch": 0.38148943590873213, + "grad_norm": 1.0478054285049438, + "learning_rate": 2.9636723756037875e-06, + "loss": 0.6733, + "step": 25978 + }, + { + "epoch": 0.38194467389907666, + "grad_norm": 1.128760576248169, + "learning_rate": 2.9241342809289833e-06, + "loss": 0.6697, + "step": 26009 + }, + { + "epoch": 0.38239991188942124, + "grad_norm": 0.9529085755348206, + "learning_rate": 2.8848453093983594e-06, + "loss": 0.6713, + "step": 26040 + }, + { + "epoch": 0.38285514987976577, + "grad_norm": 1.1507978439331055, + "learning_rate": 2.8458059043826257e-06, + "loss": 0.6737, + "step": 26071 + }, + { + "epoch": 0.38331038787011035, + "grad_norm": 1.010461688041687, + "learning_rate": 2.807016506436172e-06, + "loss": 0.6759, + "step": 26102 + }, + { + "epoch": 0.3837656258604549, + "grad_norm": 0.9800288081169128, + "learning_rate": 2.7684775532920566e-06, + "loss": 0.6766, + "step": 26133 + }, + { + "epoch": 0.3842208638507994, + "grad_norm": 1.0265660285949707, + "learning_rate": 2.7301894798571425e-06, + "loss": 0.6621, + "step": 26164 + }, + { + "epoch": 0.384676101841144, + "grad_norm": 0.9935845136642456, + "learning_rate": 2.6921527182071386e-06, + "loss": 0.6806, + "step": 26195 + }, + { + "epoch": 0.3851313398314885, + "grad_norm": 0.9483484625816345, + "learning_rate": 2.654367697581725e-06, + "loss": 0.6701, + "step": 26226 + }, + { + "epoch": 0.3855865778218331, + "grad_norm": 0.9978191256523132, + "learning_rate": 2.6168348443797175e-06, + "loss": 0.6777, + "step": 26257 + }, + { + "epoch": 0.3860418158121776, + "grad_norm": 1.0558881759643555, + "learning_rate": 2.5795545821542757e-06, + "loss": 0.6717, + "step": 26288 + }, + { + "epoch": 0.38649705380252214, + "grad_norm": 0.9377934336662292, + "learning_rate": 2.54252733160808e-06, + "loss": 0.6664, + "step": 26319 + }, + { + "epoch": 0.3869522917928667, + "grad_norm": 1.0415966510772705, + "learning_rate": 2.5057535105886294e-06, + "loss": 0.6739, + "step": 26350 + }, + { + "epoch": 0.38740752978321125, + "grad_norm": 1.0703872442245483, + "learning_rate": 2.4692335340834953e-06, + "loss": 0.6739, + "step": 26381 + }, + { + "epoch": 0.3878627677735558, + "grad_norm": 1.0336194038391113, + "learning_rate": 2.432967814215639e-06, + "loss": 0.6802, + "step": 26412 + }, + { + "epoch": 0.38831800576390035, + "grad_norm": 1.1104646921157837, + "learning_rate": 2.396956760238794e-06, + "loss": 0.6778, + "step": 26443 + }, + { + "epoch": 0.38877324375424493, + "grad_norm": 0.9827953577041626, + "learning_rate": 2.361200778532796e-06, + "loss": 0.6739, + "step": 26474 + }, + { + "epoch": 0.38922848174458946, + "grad_norm": 1.060691237449646, + "learning_rate": 2.325700272599049e-06, + "loss": 0.6784, + "step": 26505 + }, + { + "epoch": 0.389683719734934, + "grad_norm": 1.0089689493179321, + "learning_rate": 2.2904556430559415e-06, + "loss": 0.6738, + "step": 26536 + }, + { + "epoch": 0.39013895772527857, + "grad_norm": 0.9602312445640564, + "learning_rate": 2.2554672876343106e-06, + "loss": 0.668, + "step": 26567 + }, + { + "epoch": 0.3905941957156231, + "grad_norm": 1.0028860569000244, + "learning_rate": 2.220735601173002e-06, + "loss": 0.6746, + "step": 26598 + }, + { + "epoch": 0.3910494337059677, + "grad_norm": 0.9555149674415588, + "learning_rate": 2.186260975614382e-06, + "loss": 0.6686, + "step": 26629 + }, + { + "epoch": 0.3915046716963122, + "grad_norm": 0.9502089619636536, + "learning_rate": 2.1520437999999034e-06, + "loss": 0.6789, + "step": 26660 + }, + { + "epoch": 0.3919599096866567, + "grad_norm": 0.9777249693870544, + "learning_rate": 2.1180844604657526e-06, + "loss": 0.6732, + "step": 26691 + }, + { + "epoch": 0.3924151476770013, + "grad_norm": 1.0464751720428467, + "learning_rate": 2.084383340238455e-06, + "loss": 0.6809, + "step": 26722 + }, + { + "epoch": 0.39287038566734583, + "grad_norm": 0.981984555721283, + "learning_rate": 2.0509408196305704e-06, + "loss": 0.6839, + "step": 26753 + }, + { + "epoch": 0.3933256236576904, + "grad_norm": 1.3059405088424683, + "learning_rate": 2.017757276036403e-06, + "loss": 0.6841, + "step": 26784 + }, + { + "epoch": 0.39378086164803494, + "grad_norm": 0.9858521223068237, + "learning_rate": 1.984833083927726e-06, + "loss": 0.6747, + "step": 26815 + }, + { + "epoch": 0.3942360996383795, + "grad_norm": 0.9905960559844971, + "learning_rate": 1.952168614849581e-06, + "loss": 0.6706, + "step": 26846 + }, + { + "epoch": 0.39469133762872405, + "grad_norm": 1.011273980140686, + "learning_rate": 1.919764237416058e-06, + "loss": 0.6633, + "step": 26877 + }, + { + "epoch": 0.3951465756190686, + "grad_norm": 0.9586290717124939, + "learning_rate": 1.8876203173061463e-06, + "loss": 0.6758, + "step": 26908 + }, + { + "epoch": 0.39560181360941316, + "grad_norm": 1.0454294681549072, + "learning_rate": 1.8557372172596206e-06, + "loss": 0.6795, + "step": 26939 + }, + { + "epoch": 0.3960570515997577, + "grad_norm": 0.969967246055603, + "learning_rate": 1.8241152970729341e-06, + "loss": 0.6777, + "step": 26970 + }, + { + "epoch": 0.39651228959010226, + "grad_norm": 1.0945801734924316, + "learning_rate": 1.7927549135951572e-06, + "loss": 0.6695, + "step": 27001 + }, + { + "epoch": 0.3969675275804468, + "grad_norm": 1.036529541015625, + "learning_rate": 1.7616564207239477e-06, + "loss": 0.6702, + "step": 27032 + }, + { + "epoch": 0.39742276557079137, + "grad_norm": 1.02977454662323, + "learning_rate": 1.730820169401584e-06, + "loss": 0.6773, + "step": 27063 + }, + { + "epoch": 0.3978780035611359, + "grad_norm": 1.1005606651306152, + "learning_rate": 1.7002465076109558e-06, + "loss": 0.6787, + "step": 27094 + }, + { + "epoch": 0.3983332415514804, + "grad_norm": 1.0047553777694702, + "learning_rate": 1.6699357803716898e-06, + "loss": 0.6787, + "step": 27125 + }, + { + "epoch": 0.398788479541825, + "grad_norm": 1.0358285903930664, + "learning_rate": 1.6398883297362305e-06, + "loss": 0.6747, + "step": 27156 + }, + { + "epoch": 0.39924371753216953, + "grad_norm": 0.9317636489868164, + "learning_rate": 1.6101044947859606e-06, + "loss": 0.6675, + "step": 27187 + }, + { + "epoch": 0.3996989555225141, + "grad_norm": 1.0691051483154297, + "learning_rate": 1.5805846116274114e-06, + "loss": 0.6726, + "step": 27218 + }, + { + "epoch": 0.40015419351285864, + "grad_norm": 1.053666114807129, + "learning_rate": 1.5513290133884611e-06, + "loss": 0.6766, + "step": 27249 + }, + { + "epoch": 0.40060943150320316, + "grad_norm": 1.0425807237625122, + "learning_rate": 1.5223380302145512e-06, + "loss": 0.6673, + "step": 27280 + }, + { + "epoch": 0.40106466949354774, + "grad_norm": 0.9612600207328796, + "learning_rate": 1.4936119892649925e-06, + "loss": 0.6768, + "step": 27311 + }, + { + "epoch": 0.40151990748389227, + "grad_norm": 0.9574009776115417, + "learning_rate": 1.4651512147092482e-06, + "loss": 0.6678, + "step": 27342 + }, + { + "epoch": 0.40197514547423685, + "grad_norm": 0.9669239521026611, + "learning_rate": 1.4369560277232908e-06, + "loss": 0.6805, + "step": 27373 + }, + { + "epoch": 0.4024303834645814, + "grad_norm": 1.0453078746795654, + "learning_rate": 1.409026746485978e-06, + "loss": 0.6714, + "step": 27404 + }, + { + "epoch": 0.40288562145492596, + "grad_norm": 1.1115361452102661, + "learning_rate": 1.3813636861754464e-06, + "loss": 0.671, + "step": 27435 + }, + { + "epoch": 0.4033408594452705, + "grad_norm": 1.0532453060150146, + "learning_rate": 1.3539671589655773e-06, + "loss": 0.6814, + "step": 27466 + }, + { + "epoch": 0.403796097435615, + "grad_norm": 1.0039855241775513, + "learning_rate": 1.3268374740224548e-06, + "loss": 0.6696, + "step": 27497 + }, + { + "epoch": 0.4042513354259596, + "grad_norm": 0.9870678186416626, + "learning_rate": 1.2999749375008807e-06, + "loss": 0.6754, + "step": 27528 + }, + { + "epoch": 0.4047065734163041, + "grad_norm": 0.9539749622344971, + "learning_rate": 1.2733798525409346e-06, + "loss": 0.681, + "step": 27559 + }, + { + "epoch": 0.4051618114066487, + "grad_norm": 1.0031741857528687, + "learning_rate": 1.2470525192645383e-06, + "loss": 0.6715, + "step": 27590 + }, + { + "epoch": 0.4056170493969932, + "grad_norm": 0.9963164329528809, + "learning_rate": 1.2209932347720666e-06, + "loss": 0.6759, + "step": 27621 + }, + { + "epoch": 0.40607228738733775, + "grad_norm": 1.0699001550674438, + "learning_rate": 1.1952022931389972e-06, + "loss": 0.6721, + "step": 27652 + }, + { + "epoch": 0.40652752537768233, + "grad_norm": 1.0070579051971436, + "learning_rate": 1.1696799854126083e-06, + "loss": 0.6779, + "step": 27683 + }, + { + "epoch": 0.40698276336802686, + "grad_norm": 1.043058156967163, + "learning_rate": 1.1444265996086694e-06, + "loss": 0.6744, + "step": 27714 + }, + { + "epoch": 0.40743800135837144, + "grad_norm": 1.0496736764907837, + "learning_rate": 1.119442420708211e-06, + "loss": 0.6678, + "step": 27745 + }, + { + "epoch": 0.40789323934871596, + "grad_norm": 1.027978777885437, + "learning_rate": 1.0947277306542964e-06, + "loss": 0.6705, + "step": 27776 + }, + { + "epoch": 0.40834847733906054, + "grad_norm": 0.9846538305282593, + "learning_rate": 1.0702828083488353e-06, + "loss": 0.6708, + "step": 27807 + }, + { + "epoch": 0.40880371532940507, + "grad_norm": 0.9715524911880493, + "learning_rate": 1.0461079296494647e-06, + "loss": 0.6669, + "step": 27838 + }, + { + "epoch": 0.4092589533197496, + "grad_norm": 0.9620317220687866, + "learning_rate": 1.0222033673663978e-06, + "loss": 0.6714, + "step": 27869 + }, + { + "epoch": 0.4097141913100942, + "grad_norm": 1.0026613473892212, + "learning_rate": 9.985693912593713e-07, + "loss": 0.6788, + "step": 27900 + }, + { + "epoch": 0.4101694293004387, + "grad_norm": 1.0018590688705444, + "learning_rate": 9.752062680346035e-07, + "loss": 0.6817, + "step": 27931 + }, + { + "epoch": 0.4106246672907833, + "grad_norm": 0.9831053018569946, + "learning_rate": 9.521142613417494e-07, + "loss": 0.6728, + "step": 27962 + }, + { + "epoch": 0.4110799052811278, + "grad_norm": 0.9390705227851868, + "learning_rate": 9.292936317709722e-07, + "loss": 0.6728, + "step": 27993 + }, + { + "epoch": 0.41153514327147234, + "grad_norm": 1.0296902656555176, + "learning_rate": 9.067446368499793e-07, + "loss": 0.6844, + "step": 28024 + }, + { + "epoch": 0.4119903812618169, + "grad_norm": 0.9773331880569458, + "learning_rate": 8.844675310411055e-07, + "loss": 0.6708, + "step": 28055 + }, + { + "epoch": 0.41244561925216144, + "grad_norm": 1.08011794090271, + "learning_rate": 8.6246256573847e-07, + "loss": 0.6649, + "step": 28086 + }, + { + "epoch": 0.412900857242506, + "grad_norm": 1.027280569076538, + "learning_rate": 8.407299892651127e-07, + "loss": 0.6728, + "step": 28117 + }, + { + "epoch": 0.41335609523285055, + "grad_norm": 1.0942927598953247, + "learning_rate": 8.19270046870202e-07, + "loss": 0.6785, + "step": 28148 + }, + { + "epoch": 0.41381133322319513, + "grad_norm": 1.000294804573059, + "learning_rate": 7.980829807262752e-07, + "loss": 0.6696, + "step": 28179 + }, + { + "epoch": 0.41426657121353966, + "grad_norm": 1.0161492824554443, + "learning_rate": 7.771690299264889e-07, + "loss": 0.6764, + "step": 28210 + }, + { + "epoch": 0.4147218092038842, + "grad_norm": 0.9774207472801208, + "learning_rate": 7.565284304819426e-07, + "loss": 0.6739, + "step": 28241 + }, + { + "epoch": 0.41517704719422877, + "grad_norm": 1.014341115951538, + "learning_rate": 7.361614153189922e-07, + "loss": 0.6712, + "step": 28272 + }, + { + "epoch": 0.4156322851845733, + "grad_norm": 1.069413661956787, + "learning_rate": 7.160682142766328e-07, + "loss": 0.6767, + "step": 28303 + }, + { + "epoch": 0.4160875231749179, + "grad_norm": 1.0084484815597534, + "learning_rate": 6.962490541039091e-07, + "loss": 0.6696, + "step": 28334 + }, + { + "epoch": 0.4165427611652624, + "grad_norm": 1.0383868217468262, + "learning_rate": 6.767041584573531e-07, + "loss": 0.6693, + "step": 28365 + }, + { + "epoch": 0.416997999155607, + "grad_norm": 1.0503824949264526, + "learning_rate": 6.574337478984532e-07, + "loss": 0.6775, + "step": 28396 + }, + { + "epoch": 0.4174532371459515, + "grad_norm": 0.9797336459159851, + "learning_rate": 6.384380398911732e-07, + "loss": 0.6718, + "step": 28427 + }, + { + "epoch": 0.41790847513629603, + "grad_norm": 1.0050994157791138, + "learning_rate": 6.197172487994951e-07, + "loss": 0.6737, + "step": 28458 + }, + { + "epoch": 0.4183637131266406, + "grad_norm": 0.9689647555351257, + "learning_rate": 6.012715858850021e-07, + "loss": 0.6709, + "step": 28489 + }, + { + "epoch": 0.41881895111698514, + "grad_norm": 1.0405714511871338, + "learning_rate": 5.831012593044971e-07, + "loss": 0.6697, + "step": 28520 + }, + { + "epoch": 0.4192741891073297, + "grad_norm": 1.0198075771331787, + "learning_rate": 5.652064741076435e-07, + "loss": 0.6778, + "step": 28551 + }, + { + "epoch": 0.41972942709767425, + "grad_norm": 1.020678997039795, + "learning_rate": 5.475874322346558e-07, + "loss": 0.6652, + "step": 28582 + }, + { + "epoch": 0.42018466508801877, + "grad_norm": 1.06432044506073, + "learning_rate": 5.30244332514035e-07, + "loss": 0.6754, + "step": 28613 + }, + { + "epoch": 0.42063990307836335, + "grad_norm": 0.9759795665740967, + "learning_rate": 5.131773706602977e-07, + "loss": 0.6733, + "step": 28644 + }, + { + "epoch": 0.4210951410687079, + "grad_norm": 0.9904633164405823, + "learning_rate": 4.963867392717897e-07, + "loss": 0.6752, + "step": 28675 + }, + { + "epoch": 0.42155037905905246, + "grad_norm": 0.9863582849502563, + "learning_rate": 4.798726278285093e-07, + "loss": 0.6732, + "step": 28706 + }, + { + "epoch": 0.422005617049397, + "grad_norm": 1.0036576986312866, + "learning_rate": 4.6363522268995097e-07, + "loss": 0.6794, + "step": 28737 + }, + { + "epoch": 0.42246085503974157, + "grad_norm": 1.0057259798049927, + "learning_rate": 4.4767470709302927e-07, + "loss": 0.673, + "step": 28768 + }, + { + "epoch": 0.4229160930300861, + "grad_norm": 1.0986577272415161, + "learning_rate": 4.319912611499971e-07, + "loss": 0.6808, + "step": 28799 + }, + { + "epoch": 0.4233713310204306, + "grad_norm": 0.9711052775382996, + "learning_rate": 4.1658506184640564e-07, + "loss": 0.6706, + "step": 28830 + }, + { + "epoch": 0.4238265690107752, + "grad_norm": 0.9482130408287048, + "learning_rate": 4.0145628303911996e-07, + "loss": 0.673, + "step": 28861 + }, + { + "epoch": 0.4242818070011197, + "grad_norm": 1.0252598524093628, + "learning_rate": 3.866050954543565e-07, + "loss": 0.6913, + "step": 28892 + }, + { + "epoch": 0.4247370449914643, + "grad_norm": 1.0529813766479492, + "learning_rate": 3.720316666857432e-07, + "loss": 0.673, + "step": 28923 + }, + { + "epoch": 0.42519228298180883, + "grad_norm": 1.034982442855835, + "learning_rate": 3.5773616119244845e-07, + "loss": 0.6732, + "step": 28954 + }, + { + "epoch": 0.42564752097215336, + "grad_norm": 1.0683727264404297, + "learning_rate": 3.437187402973052e-07, + "loss": 0.6717, + "step": 28985 + }, + { + "epoch": 0.42610275896249794, + "grad_norm": 1.0270038843154907, + "learning_rate": 3.2997956218500104e-07, + "loss": 0.6756, + "step": 29016 + }, + { + "epoch": 0.42655799695284247, + "grad_norm": 1.042259931564331, + "learning_rate": 3.165187819003018e-07, + "loss": 0.6778, + "step": 29047 + }, + { + "epoch": 0.42701323494318705, + "grad_norm": 1.0482107400894165, + "learning_rate": 3.033365513462755e-07, + "loss": 0.6694, + "step": 29078 + }, + { + "epoch": 0.4274684729335316, + "grad_norm": 1.0334166288375854, + "learning_rate": 2.9043301928260437e-07, + "loss": 0.6738, + "step": 29109 + }, + { + "epoch": 0.42792371092387615, + "grad_norm": 1.0055501461029053, + "learning_rate": 2.7780833132389773e-07, + "loss": 0.668, + "step": 29140 + }, + { + "epoch": 0.4283789489142207, + "grad_norm": 1.0406752824783325, + "learning_rate": 2.6546262993803473e-07, + "loss": 0.6739, + "step": 29171 + }, + { + "epoch": 0.4288341869045652, + "grad_norm": 0.9942592978477478, + "learning_rate": 2.533960544445879e-07, + "loss": 0.661, + "step": 29202 + }, + { + "epoch": 0.4292894248949098, + "grad_norm": 1.1360158920288086, + "learning_rate": 2.416087410132134e-07, + "loss": 0.678, + "step": 29233 + }, + { + "epoch": 0.4297446628852543, + "grad_norm": 0.9448338150978088, + "learning_rate": 2.301008226621465e-07, + "loss": 0.6691, + "step": 29264 + }, + { + "epoch": 0.4301999008755989, + "grad_norm": 0.9945282340049744, + "learning_rate": 2.1887242925668073e-07, + "loss": 0.6739, + "step": 29295 + }, + { + "epoch": 0.4306551388659434, + "grad_norm": 0.9908316135406494, + "learning_rate": 2.0792368750770785e-07, + "loss": 0.6698, + "step": 29326 + }, + { + "epoch": 0.431110376856288, + "grad_norm": 1.0034847259521484, + "learning_rate": 1.9725472097028851e-07, + "loss": 0.663, + "step": 29357 + }, + { + "epoch": 0.43156561484663253, + "grad_norm": 0.9425103664398193, + "learning_rate": 1.8686565004226718e-07, + "loss": 0.676, + "step": 29388 + }, + { + "epoch": 0.43202085283697705, + "grad_norm": 1.0126384496688843, + "learning_rate": 1.7675659196288995e-07, + "loss": 0.6798, + "step": 29419 + }, + { + "epoch": 0.43247609082732164, + "grad_norm": 0.9344136118888855, + "learning_rate": 1.6692766081150556e-07, + "loss": 0.6664, + "step": 29450 + }, + { + "epoch": 0.43293132881766616, + "grad_norm": 1.0303114652633667, + "learning_rate": 1.5737896750626647e-07, + "loss": 0.6703, + "step": 29481 + }, + { + "epoch": 0.43338656680801074, + "grad_norm": 0.9438189268112183, + "learning_rate": 1.4811061980287976e-07, + "loss": 0.6733, + "step": 29512 + }, + { + "epoch": 0.43384180479835527, + "grad_norm": 1.0802854299545288, + "learning_rate": 1.3912272229338886e-07, + "loss": 0.6731, + "step": 29543 + }, + { + "epoch": 0.4342970427886998, + "grad_norm": 0.9553311467170715, + "learning_rate": 1.3041537640499645e-07, + "loss": 0.6785, + "step": 29574 + }, + { + "epoch": 0.4347522807790444, + "grad_norm": 0.9497290849685669, + "learning_rate": 1.2198868039891564e-07, + "loss": 0.6747, + "step": 29605 + }, + { + "epoch": 0.4352075187693889, + "grad_norm": 0.9549136757850647, + "learning_rate": 1.138427293692651e-07, + "loss": 0.6713, + "step": 29636 + }, + { + "epoch": 0.4356627567597335, + "grad_norm": 0.9680078625679016, + "learning_rate": 1.0597761524199778e-07, + "loss": 0.6775, + "step": 29667 + }, + { + "epoch": 0.436117994750078, + "grad_norm": 0.9711593985557556, + "learning_rate": 9.839342677385455e-08, + "loss": 0.6744, + "step": 29698 + }, + { + "epoch": 0.4365732327404226, + "grad_norm": 0.9899353384971619, + "learning_rate": 9.109024955137325e-08, + "loss": 0.6756, + "step": 29729 + }, + { + "epoch": 0.4370284707307671, + "grad_norm": 1.0864272117614746, + "learning_rate": 8.406816598991729e-08, + "loss": 0.6738, + "step": 29760 + }, + { + "epoch": 0.43748370872111164, + "grad_norm": 0.9642808437347412, + "learning_rate": 7.73272553327431e-08, + "loss": 0.6699, + "step": 29791 + }, + { + "epoch": 0.4379389467114562, + "grad_norm": 1.0026787519454956, + "learning_rate": 7.086759365011186e-08, + "loss": 0.6663, + "step": 29822 + }, + { + "epoch": 0.43839418470180075, + "grad_norm": 0.9774051904678345, + "learning_rate": 6.468925383842639e-08, + "loss": 0.6713, + "step": 29853 + }, + { + "epoch": 0.43884942269214533, + "grad_norm": 1.0789951086044312, + "learning_rate": 5.8792305619415067e-08, + "loss": 0.6684, + "step": 29884 + }, + { + "epoch": 0.43930466068248986, + "grad_norm": 1.0644837617874146, + "learning_rate": 5.317681553933529e-08, + "loss": 0.6763, + "step": 29915 + }, + { + "epoch": 0.4397598986728344, + "grad_norm": 1.1466447114944458, + "learning_rate": 4.78428469682296e-08, + "loss": 0.6726, + "step": 29946 + }, + { + "epoch": 0.44021513666317896, + "grad_norm": 0.9780381917953491, + "learning_rate": 4.2790460099206844e-08, + "loss": 0.6695, + "step": 29977 + }, + { + "epoch": 0.4406703746535235, + "grad_norm": 1.1080350875854492, + "learning_rate": 3.801971194777043e-08, + "loss": 0.6699, + "step": 30008 + }, + { + "epoch": 0.44112561264386807, + "grad_norm": 1.0155872106552124, + "learning_rate": 3.353065635115782e-08, + "loss": 0.6641, + "step": 30039 + }, + { + "epoch": 0.4415808506342126, + "grad_norm": 0.9893083572387695, + "learning_rate": 2.93233439677576e-08, + "loss": 0.6674, + "step": 30070 + }, + { + "epoch": 0.4420360886245572, + "grad_norm": 0.9657506942749023, + "learning_rate": 2.539782227651555e-08, + "loss": 0.6684, + "step": 30101 + }, + { + "epoch": 0.4424913266149017, + "grad_norm": 0.9513658881187439, + "learning_rate": 2.175413557641004e-08, + "loss": 0.6699, + "step": 30132 + }, + { + "epoch": 0.44294656460524623, + "grad_norm": 0.9725571870803833, + "learning_rate": 1.839232498594967e-08, + "loss": 0.6689, + "step": 30163 + }, + { + "epoch": 0.4434018025955908, + "grad_norm": 0.9704023599624634, + "learning_rate": 1.5312428442712522e-08, + "loss": 0.6682, + "step": 30194 + }, + { + "epoch": 0.44385704058593534, + "grad_norm": 1.0558629035949707, + "learning_rate": 1.2514480702913168e-08, + "loss": 0.6706, + "step": 30225 + }, + { + "epoch": 0.4443122785762799, + "grad_norm": 1.1143991947174072, + "learning_rate": 9.998513341005766e-09, + "loss": 0.6711, + "step": 30256 + }, + { + "epoch": 0.44476751656662444, + "grad_norm": 0.945041298866272, + "learning_rate": 7.764554749345454e-09, + "loss": 0.6715, + "step": 30287 + }, + { + "epoch": 0.44522275455696897, + "grad_norm": 0.9509717226028442, + "learning_rate": 5.812630137849717e-09, + "loss": 0.6733, + "step": 30318 + }, + { + "epoch": 0.44567799254731355, + "grad_norm": 1.062523603439331, + "learning_rate": 4.142761533723616e-09, + "loss": 0.6773, + "step": 30349 + }, + { + "epoch": 0.4461332305376581, + "grad_norm": 1.0588343143463135, + "learning_rate": 2.7549677812044317e-09, + "loss": 0.6713, + "step": 30380 + }, + { + "epoch": 0.44658846852800266, + "grad_norm": 1.0594124794006348, + "learning_rate": 1.6492645413590525e-09, + "loss": 0.6697, + "step": 30411 + }, + { + "epoch": 0.4470437065183472, + "grad_norm": 1.0125008821487427, + "learning_rate": 8.256642918980096e-10, + "loss": 0.6728, + "step": 30442 + }, + { + "epoch": 0.44749894450869176, + "grad_norm": 1.038161039352417, + "learning_rate": 2.841763270367004e-10, + "loss": 0.6855, + "step": 30473 + }, + { + "epoch": 0.4479541824990363, + "grad_norm": 1.046329379081726, + "learning_rate": 2.480675739269245e-11, + "loss": 0.6738, + "step": 30504 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.251434749612104e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-30517/training_args.bin b/checkpoint-30517/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-30517/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-3052/config.json b/checkpoint-3052/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-3052/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-3052/generation_config.json b/checkpoint-3052/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-3052/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-3052/model-00001-of-00007.safetensors b/checkpoint-3052/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e640e471b9c85b6ded446377d76cca71ccc5ba54 --- /dev/null +++ b/checkpoint-3052/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dbaee4fe2c9b9e71d7eddfbfa3df2e10ffd39477f3db9f6cf0dca9f8c503ebb +size 4886466168 diff --git a/checkpoint-3052/model-00002-of-00007.safetensors b/checkpoint-3052/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-3052/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-3052/model-00003-of-00007.safetensors b/checkpoint-3052/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-3052/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-3052/model-00004-of-00007.safetensors b/checkpoint-3052/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-3052/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-3052/model-00005-of-00007.safetensors b/checkpoint-3052/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-3052/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-3052/model-00006-of-00007.safetensors b/checkpoint-3052/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e80557548f4abbbf373da73882a384dd5458eeea --- /dev/null +++ b/checkpoint-3052/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3596ab0993d98b7821ba4e1c56fd5881c550dcf49c8dd018b44522691d194a85 +size 4999813120 diff --git a/checkpoint-3052/model-00007-of-00007.safetensors b/checkpoint-3052/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a96c585bff9b485edeac02fa3b5bde0957a1240c --- /dev/null +++ b/checkpoint-3052/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2149ac35dbeb16e566edf8b1498c625a1d9f7bc48111bec5095348ed48346cb8 +size 2571158184 diff --git a/checkpoint-3052/model.safetensors.index.json b/checkpoint-3052/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-3052/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-3052/optimizer.pt b/checkpoint-3052/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0bf7d9d71c20c5deba2baada50dd9b7add6f2fd --- /dev/null +++ b/checkpoint-3052/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f20132c513d46a0930ded82f34f0873d9ff79c2fd58e6d5164ab77dec79016 +size 15385036334 diff --git a/checkpoint-3052/rng_state.pth b/checkpoint-3052/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-3052/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-3052/scheduler.pt b/checkpoint-3052/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b2454d919340cd4d989697a74a27016c58dc3aa --- /dev/null +++ b/checkpoint-3052/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed9d7fea0b9f468b8c97fd491e0f5a211b8ff197e5f8111c42fc974ecafed4c +size 1064 diff --git a/checkpoint-3052/trainer_state.json b/checkpoint-3052/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9ae7decab4bcb56d607f6a1035661fc1c39081de --- /dev/null +++ b/checkpoint-3052/trainer_state.json @@ -0,0 +1,719 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.044818914404243994, + "eval_steps": 500, + "global_step": 3052, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.251656078846591e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3052/training_args.bin b/checkpoint-3052/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-3052/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-6104/config.json b/checkpoint-6104/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-6104/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-6104/generation_config.json b/checkpoint-6104/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-6104/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-6104/model-00001-of-00007.safetensors b/checkpoint-6104/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..832b2305d11390e6e407dece2cb26703da657233 --- /dev/null +++ b/checkpoint-6104/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4598b05e4f1b33be87bd140c8665fe3a0d22b50f721212bcb46a05537c68b6 +size 4886466168 diff --git a/checkpoint-6104/model-00002-of-00007.safetensors b/checkpoint-6104/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-6104/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-6104/model-00003-of-00007.safetensors b/checkpoint-6104/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-6104/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-6104/model-00004-of-00007.safetensors b/checkpoint-6104/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-6104/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-6104/model-00005-of-00007.safetensors b/checkpoint-6104/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-6104/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-6104/model-00006-of-00007.safetensors b/checkpoint-6104/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cfc160839b4c4161dd6233b195baf31143ddc33f --- /dev/null +++ b/checkpoint-6104/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb4e7292906bc37c443792fc7fb15db5c4bd17f84c3c8dddce3eff6f39181d47 +size 4999813120 diff --git a/checkpoint-6104/model-00007-of-00007.safetensors b/checkpoint-6104/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf112ef59d8c3feb33be179d714fe5fc4bd17ef2 --- /dev/null +++ b/checkpoint-6104/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e7bcb0bda96a9cdcc61240b868bae7181f82e6367a24fcc658f40c305a21d10 +size 2571158184 diff --git a/checkpoint-6104/model.safetensors.index.json b/checkpoint-6104/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-6104/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-6104/optimizer.pt b/checkpoint-6104/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..235a624834aa33bbf3c76178bef974efd0319afe --- /dev/null +++ b/checkpoint-6104/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a73cbc63a3d350e7fd3700b462d68c0dc76928820baab6eb9dcae0f285dab4 +size 15385036334 diff --git a/checkpoint-6104/rng_state.pth b/checkpoint-6104/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-6104/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-6104/scheduler.pt b/checkpoint-6104/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f5e5514ba898102fcdb5731bba1ae1c2957e6e5 --- /dev/null +++ b/checkpoint-6104/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:107e0617754026d870a7da422dabb716a8dc7d3a550066ff507e37f8f0818429 +size 1064 diff --git a/checkpoint-6104/trainer_state.json b/checkpoint-6104/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..05fcf00d9021ce6c818fdabc7c99bce80811106d --- /dev/null +++ b/checkpoint-6104/trainer_state.json @@ -0,0 +1,1405 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08963782880848799, + "eval_steps": 500, + "global_step": 6104, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.503312157693182e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-6104/training_args.bin b/checkpoint-6104/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-6104/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/checkpoint-9156/config.json b/checkpoint-9156/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/checkpoint-9156/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/checkpoint-9156/generation_config.json b/checkpoint-9156/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/checkpoint-9156/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/checkpoint-9156/model-00001-of-00007.safetensors b/checkpoint-9156/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a50f3244f0e13dc92b42211e505406fae4ce118 --- /dev/null +++ b/checkpoint-9156/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b1e06d27f0129dd85b0b364d8dcb86693526d2c7de30bd1741c023e596534a +size 4886466168 diff --git a/checkpoint-9156/model-00002-of-00007.safetensors b/checkpoint-9156/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/checkpoint-9156/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/checkpoint-9156/model-00003-of-00007.safetensors b/checkpoint-9156/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/checkpoint-9156/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/checkpoint-9156/model-00004-of-00007.safetensors b/checkpoint-9156/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/checkpoint-9156/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/checkpoint-9156/model-00005-of-00007.safetensors b/checkpoint-9156/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/checkpoint-9156/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/checkpoint-9156/model-00006-of-00007.safetensors b/checkpoint-9156/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3536ec2c5d8f019af87152eaea589941c6781854 --- /dev/null +++ b/checkpoint-9156/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ce0d3e353709bf38529c6f96057355c8229edaae83f84de21f372967d49609 +size 4999813120 diff --git a/checkpoint-9156/model-00007-of-00007.safetensors b/checkpoint-9156/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f33cc3e7f54631bfc48d404832e766f7dccb6101 --- /dev/null +++ b/checkpoint-9156/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3cede9b04875eb5a0f0cdab2e7d5286e9a4998d73cfb0426d90aa65e5fab6b +size 2571158184 diff --git a/checkpoint-9156/model.safetensors.index.json b/checkpoint-9156/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/checkpoint-9156/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/checkpoint-9156/optimizer.pt b/checkpoint-9156/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f61fbac8d55b486d20c3ec98cf87fd069f016182 --- /dev/null +++ b/checkpoint-9156/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db3671e1b395e48ecd20812e2f37f1b7e52fcff0fefed5b2e8d8415452db5fc4 +size 15385036334 diff --git a/checkpoint-9156/rng_state.pth b/checkpoint-9156/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..897ad19948758b40c785c678c136c4029433a27a --- /dev/null +++ b/checkpoint-9156/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405 +size 14244 diff --git a/checkpoint-9156/scheduler.pt b/checkpoint-9156/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5456a295e7e9e24785bebf5e96ccb62dbbac4f62 --- /dev/null +++ b/checkpoint-9156/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c3f410c61b11096714461ebc2a4aa1b4573d0d0c3eb997bda14fafb34cdc922 +size 1064 diff --git a/checkpoint-9156/trainer_state.json b/checkpoint-9156/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c9e1f9b0e61916e9456f84453f41c2d8df235220 --- /dev/null +++ b/checkpoint-9156/trainer_state.json @@ -0,0 +1,2098 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.134456743212732, + "eval_steps": 500, + "global_step": 9156, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004552379903445491, + "grad_norm": 4.0020222663879395, + "learning_rate": 1.0157273918741808e-06, + "loss": 0.92, + "step": 31 + }, + { + "epoch": 0.0009104759806890982, + "grad_norm": 3.06746506690979, + "learning_rate": 2.0314547837483616e-06, + "loss": 0.8898, + "step": 62 + }, + { + "epoch": 0.0013657139710336473, + "grad_norm": 3.004636764526367, + "learning_rate": 3.0471821756225426e-06, + "loss": 0.8653, + "step": 93 + }, + { + "epoch": 0.0018209519613781964, + "grad_norm": 3.0733370780944824, + "learning_rate": 4.062909567496723e-06, + "loss": 0.8516, + "step": 124 + }, + { + "epoch": 0.0022761899517227454, + "grad_norm": 2.898719549179077, + "learning_rate": 5.078636959370905e-06, + "loss": 0.8461, + "step": 155 + }, + { + "epoch": 0.0027314279420672946, + "grad_norm": 2.8273258209228516, + "learning_rate": 6.094364351245085e-06, + "loss": 0.8362, + "step": 186 + }, + { + "epoch": 0.0031866659324118435, + "grad_norm": 2.9983041286468506, + "learning_rate": 7.110091743119267e-06, + "loss": 0.8274, + "step": 217 + }, + { + "epoch": 0.0036419039227563927, + "grad_norm": 2.9561767578125, + "learning_rate": 8.125819134993446e-06, + "loss": 0.8373, + "step": 248 + }, + { + "epoch": 0.0040971419131009415, + "grad_norm": 2.732726812362671, + "learning_rate": 9.141546526867629e-06, + "loss": 0.8342, + "step": 279 + }, + { + "epoch": 0.004552379903445491, + "grad_norm": 2.8236358165740967, + "learning_rate": 1.015727391874181e-05, + "loss": 0.8387, + "step": 310 + }, + { + "epoch": 0.00500761789379004, + "grad_norm": 2.8730266094207764, + "learning_rate": 1.117300131061599e-05, + "loss": 0.8328, + "step": 341 + }, + { + "epoch": 0.005462855884134589, + "grad_norm": 2.717439651489258, + "learning_rate": 1.218872870249017e-05, + "loss": 0.8316, + "step": 372 + }, + { + "epoch": 0.005918093874479138, + "grad_norm": 2.9625961780548096, + "learning_rate": 1.3204456094364351e-05, + "loss": 0.8295, + "step": 403 + }, + { + "epoch": 0.006373331864823687, + "grad_norm": 2.6649422645568848, + "learning_rate": 1.4220183486238533e-05, + "loss": 0.8356, + "step": 434 + }, + { + "epoch": 0.006828569855168236, + "grad_norm": 2.9066264629364014, + "learning_rate": 1.5235910878112714e-05, + "loss": 0.8313, + "step": 465 + }, + { + "epoch": 0.0072838078455127854, + "grad_norm": 2.7710695266723633, + "learning_rate": 1.6251638269986893e-05, + "loss": 0.8251, + "step": 496 + }, + { + "epoch": 0.007739045835857335, + "grad_norm": 2.416724443435669, + "learning_rate": 1.7267365661861077e-05, + "loss": 0.8319, + "step": 527 + }, + { + "epoch": 0.008194283826201883, + "grad_norm": 2.583005905151367, + "learning_rate": 1.8283093053735257e-05, + "loss": 0.8265, + "step": 558 + }, + { + "epoch": 0.008649521816546433, + "grad_norm": 2.6601760387420654, + "learning_rate": 1.9298820445609438e-05, + "loss": 0.8328, + "step": 589 + }, + { + "epoch": 0.009104759806890982, + "grad_norm": 2.9271795749664307, + "learning_rate": 2.031454783748362e-05, + "loss": 0.8282, + "step": 620 + }, + { + "epoch": 0.00955999779723553, + "grad_norm": 2.3631091117858887, + "learning_rate": 2.13302752293578e-05, + "loss": 0.8314, + "step": 651 + }, + { + "epoch": 0.01001523578758008, + "grad_norm": 2.4002573490142822, + "learning_rate": 2.234600262123198e-05, + "loss": 0.829, + "step": 682 + }, + { + "epoch": 0.010470473777924628, + "grad_norm": 2.629352331161499, + "learning_rate": 2.336173001310616e-05, + "loss": 0.8315, + "step": 713 + }, + { + "epoch": 0.010925711768269179, + "grad_norm": 2.3604094982147217, + "learning_rate": 2.437745740498034e-05, + "loss": 0.8338, + "step": 744 + }, + { + "epoch": 0.011380949758613727, + "grad_norm": 2.8708078861236572, + "learning_rate": 2.5393184796854525e-05, + "loss": 0.843, + "step": 775 + }, + { + "epoch": 0.011836187748958275, + "grad_norm": 2.7403945922851562, + "learning_rate": 2.6408912188728702e-05, + "loss": 0.8372, + "step": 806 + }, + { + "epoch": 0.012291425739302825, + "grad_norm": 2.733816623687744, + "learning_rate": 2.7424639580602886e-05, + "loss": 0.8327, + "step": 837 + }, + { + "epoch": 0.012746663729647374, + "grad_norm": 2.2656960487365723, + "learning_rate": 2.8440366972477066e-05, + "loss": 0.8357, + "step": 868 + }, + { + "epoch": 0.013201901719991924, + "grad_norm": 2.18038010597229, + "learning_rate": 2.9456094364351244e-05, + "loss": 0.8331, + "step": 899 + }, + { + "epoch": 0.013657139710336472, + "grad_norm": 2.497558832168579, + "learning_rate": 3.0471821756225428e-05, + "loss": 0.8271, + "step": 930 + }, + { + "epoch": 0.01411237770068102, + "grad_norm": 2.322632074356079, + "learning_rate": 3.148754914809961e-05, + "loss": 0.8327, + "step": 961 + }, + { + "epoch": 0.014567615691025571, + "grad_norm": 2.5596141815185547, + "learning_rate": 3.2503276539973785e-05, + "loss": 0.8367, + "step": 992 + }, + { + "epoch": 0.01502285368137012, + "grad_norm": 2.1262409687042236, + "learning_rate": 3.351900393184797e-05, + "loss": 0.8192, + "step": 1023 + }, + { + "epoch": 0.01547809167171467, + "grad_norm": 2.2483584880828857, + "learning_rate": 3.453473132372215e-05, + "loss": 0.8352, + "step": 1054 + }, + { + "epoch": 0.015933329662059216, + "grad_norm": 2.224043130874634, + "learning_rate": 3.555045871559633e-05, + "loss": 0.8362, + "step": 1085 + }, + { + "epoch": 0.016388567652403766, + "grad_norm": 2.104788303375244, + "learning_rate": 3.6566186107470514e-05, + "loss": 0.832, + "step": 1116 + }, + { + "epoch": 0.016843805642748316, + "grad_norm": 2.126499652862549, + "learning_rate": 3.7581913499344695e-05, + "loss": 0.8341, + "step": 1147 + }, + { + "epoch": 0.017299043633092866, + "grad_norm": 2.221691370010376, + "learning_rate": 3.8597640891218876e-05, + "loss": 0.8346, + "step": 1178 + }, + { + "epoch": 0.017754281623437413, + "grad_norm": 2.4445159435272217, + "learning_rate": 3.9613368283093056e-05, + "loss": 0.8415, + "step": 1209 + }, + { + "epoch": 0.018209519613781963, + "grad_norm": 2.321516752243042, + "learning_rate": 4.062909567496724e-05, + "loss": 0.8289, + "step": 1240 + }, + { + "epoch": 0.018664757604126513, + "grad_norm": 2.2673656940460205, + "learning_rate": 4.164482306684142e-05, + "loss": 0.8307, + "step": 1271 + }, + { + "epoch": 0.01911999559447106, + "grad_norm": 2.1191513538360596, + "learning_rate": 4.26605504587156e-05, + "loss": 0.8319, + "step": 1302 + }, + { + "epoch": 0.01957523358481561, + "grad_norm": 2.18985652923584, + "learning_rate": 4.367627785058978e-05, + "loss": 0.8398, + "step": 1333 + }, + { + "epoch": 0.02003047157516016, + "grad_norm": 1.9449113607406616, + "learning_rate": 4.469200524246396e-05, + "loss": 0.8304, + "step": 1364 + }, + { + "epoch": 0.020485709565504707, + "grad_norm": 2.0119357109069824, + "learning_rate": 4.570773263433814e-05, + "loss": 0.82, + "step": 1395 + }, + { + "epoch": 0.020940947555849257, + "grad_norm": 2.0013489723205566, + "learning_rate": 4.672346002621232e-05, + "loss": 0.8275, + "step": 1426 + }, + { + "epoch": 0.021396185546193807, + "grad_norm": 2.0175704956054688, + "learning_rate": 4.77391874180865e-05, + "loss": 0.8348, + "step": 1457 + }, + { + "epoch": 0.021851423536538357, + "grad_norm": 2.0555551052093506, + "learning_rate": 4.875491480996068e-05, + "loss": 0.8322, + "step": 1488 + }, + { + "epoch": 0.022306661526882904, + "grad_norm": 1.9871079921722412, + "learning_rate": 4.977064220183487e-05, + "loss": 0.8253, + "step": 1519 + }, + { + "epoch": 0.022761899517227454, + "grad_norm": 2.1991310119628906, + "learning_rate": 4.9999915451558777e-05, + "loss": 0.8252, + "step": 1550 + }, + { + "epoch": 0.023217137507572004, + "grad_norm": 1.96790611743927, + "learning_rate": 4.999955597496219e-05, + "loss": 0.8242, + "step": 1581 + }, + { + "epoch": 0.02367237549791655, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.9998914381774255e-05, + "loss": 0.833, + "step": 1612 + }, + { + "epoch": 0.0241276134882611, + "grad_norm": 2.0381035804748535, + "learning_rate": 4.999799067923527e-05, + "loss": 0.8351, + "step": 1643 + }, + { + "epoch": 0.02458285147860565, + "grad_norm": 2.145235061645508, + "learning_rate": 4.999678487776908e-05, + "loss": 0.842, + "step": 1674 + }, + { + "epoch": 0.025038089468950198, + "grad_norm": 2.0120856761932373, + "learning_rate": 4.9995296990983006e-05, + "loss": 0.838, + "step": 1705 + }, + { + "epoch": 0.025493327459294748, + "grad_norm": 1.9461947679519653, + "learning_rate": 4.999352703566763e-05, + "loss": 0.8269, + "step": 1736 + }, + { + "epoch": 0.025948565449639298, + "grad_norm": 1.9189060926437378, + "learning_rate": 4.999147503179668e-05, + "loss": 0.8214, + "step": 1767 + }, + { + "epoch": 0.026403803439983848, + "grad_norm": 1.906604290008545, + "learning_rate": 4.998914100252672e-05, + "loss": 0.8249, + "step": 1798 + }, + { + "epoch": 0.026859041430328395, + "grad_norm": 2.0810904502868652, + "learning_rate": 4.998652497419696e-05, + "loss": 0.8143, + "step": 1829 + }, + { + "epoch": 0.027314279420672945, + "grad_norm": 1.9607690572738647, + "learning_rate": 4.9983626976328927e-05, + "loss": 0.8155, + "step": 1860 + }, + { + "epoch": 0.027769517411017495, + "grad_norm": 1.8492732048034668, + "learning_rate": 4.998044704162613e-05, + "loss": 0.8168, + "step": 1891 + }, + { + "epoch": 0.02822475540136204, + "grad_norm": 2.0616824626922607, + "learning_rate": 4.9976985205973705e-05, + "loss": 0.8279, + "step": 1922 + }, + { + "epoch": 0.02867999339170659, + "grad_norm": 1.882591724395752, + "learning_rate": 4.997324150843799e-05, + "loss": 0.8199, + "step": 1953 + }, + { + "epoch": 0.029135231382051142, + "grad_norm": 1.8001904487609863, + "learning_rate": 4.99692159912661e-05, + "loss": 0.8232, + "step": 1984 + }, + { + "epoch": 0.02959046937239569, + "grad_norm": 1.94768226146698, + "learning_rate": 4.996490869988546e-05, + "loss": 0.817, + "step": 2015 + }, + { + "epoch": 0.03004570736274024, + "grad_norm": 1.845452904701233, + "learning_rate": 4.996031968290326e-05, + "loss": 0.8146, + "step": 2046 + }, + { + "epoch": 0.03050094535308479, + "grad_norm": 1.8649152517318726, + "learning_rate": 4.995544899210594e-05, + "loss": 0.8246, + "step": 2077 + }, + { + "epoch": 0.03095618334342934, + "grad_norm": 1.665781021118164, + "learning_rate": 4.9950296682458583e-05, + "loss": 0.8004, + "step": 2108 + }, + { + "epoch": 0.031411421333773885, + "grad_norm": 1.6481060981750488, + "learning_rate": 4.994486281210429e-05, + "loss": 0.7949, + "step": 2139 + }, + { + "epoch": 0.03186665932411843, + "grad_norm": 1.7481175661087036, + "learning_rate": 4.9939147442363566e-05, + "loss": 0.8142, + "step": 2170 + }, + { + "epoch": 0.032321897314462986, + "grad_norm": 1.6130069494247437, + "learning_rate": 4.9933150637733574e-05, + "loss": 0.8126, + "step": 2201 + }, + { + "epoch": 0.03277713530480753, + "grad_norm": 1.7536373138427734, + "learning_rate": 4.992687246588743e-05, + "loss": 0.8149, + "step": 2232 + }, + { + "epoch": 0.033232373295152086, + "grad_norm": 1.721200942993164, + "learning_rate": 4.992031299767347e-05, + "loss": 0.8051, + "step": 2263 + }, + { + "epoch": 0.03368761128549663, + "grad_norm": 3.3350446224212646, + "learning_rate": 4.9913472307114386e-05, + "loss": 0.8007, + "step": 2294 + }, + { + "epoch": 0.03414284927584118, + "grad_norm": 1.848281741142273, + "learning_rate": 4.9906350471406446e-05, + "loss": 0.8076, + "step": 2325 + }, + { + "epoch": 0.03459808726618573, + "grad_norm": 1.6283164024353027, + "learning_rate": 4.989894757091861e-05, + "loss": 0.8084, + "step": 2356 + }, + { + "epoch": 0.03505332525653028, + "grad_norm": 1.6567096710205078, + "learning_rate": 4.989126368919158e-05, + "loss": 0.8075, + "step": 2387 + }, + { + "epoch": 0.035508563246874826, + "grad_norm": 1.6488817930221558, + "learning_rate": 4.988329891293693e-05, + "loss": 0.7994, + "step": 2418 + }, + { + "epoch": 0.03596380123721938, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.987505333203608e-05, + "loss": 0.8064, + "step": 2449 + }, + { + "epoch": 0.036419039227563926, + "grad_norm": 1.722517967224121, + "learning_rate": 4.9866527039539276e-05, + "loss": 0.8005, + "step": 2480 + }, + { + "epoch": 0.03687427721790847, + "grad_norm": 1.8509281873703003, + "learning_rate": 4.9857720131664594e-05, + "loss": 0.8094, + "step": 2511 + }, + { + "epoch": 0.03732951520825303, + "grad_norm": 1.7140249013900757, + "learning_rate": 4.9848632707796773e-05, + "loss": 0.8018, + "step": 2542 + }, + { + "epoch": 0.03778475319859757, + "grad_norm": 1.9315160512924194, + "learning_rate": 4.9839264870486155e-05, + "loss": 0.8094, + "step": 2573 + }, + { + "epoch": 0.03823999118894212, + "grad_norm": 1.728063702583313, + "learning_rate": 4.9829616725447526e-05, + "loss": 0.8052, + "step": 2604 + }, + { + "epoch": 0.03869522917928667, + "grad_norm": 1.588205099105835, + "learning_rate": 4.981968838155888e-05, + "loss": 0.7983, + "step": 2635 + }, + { + "epoch": 0.03915046716963122, + "grad_norm": 1.6086421012878418, + "learning_rate": 4.980947995086024e-05, + "loss": 0.7961, + "step": 2666 + }, + { + "epoch": 0.03960570515997577, + "grad_norm": 1.680635929107666, + "learning_rate": 4.979899154855234e-05, + "loss": 0.8033, + "step": 2697 + }, + { + "epoch": 0.04006094315032032, + "grad_norm": 1.6833465099334717, + "learning_rate": 4.9788223292995386e-05, + "loss": 0.793, + "step": 2728 + }, + { + "epoch": 0.04051618114066487, + "grad_norm": 1.5738922357559204, + "learning_rate": 4.977717530570768e-05, + "loss": 0.7837, + "step": 2759 + }, + { + "epoch": 0.040971419131009414, + "grad_norm": 1.6513015031814575, + "learning_rate": 4.976584771136425e-05, + "loss": 0.7969, + "step": 2790 + }, + { + "epoch": 0.04142665712135397, + "grad_norm": 1.7180182933807373, + "learning_rate": 4.975424063779547e-05, + "loss": 0.7994, + "step": 2821 + }, + { + "epoch": 0.041881895111698514, + "grad_norm": 1.5236577987670898, + "learning_rate": 4.974235421598557e-05, + "loss": 0.7952, + "step": 2852 + }, + { + "epoch": 0.04233713310204307, + "grad_norm": 1.760188341140747, + "learning_rate": 4.973018858007122e-05, + "loss": 0.795, + "step": 2883 + }, + { + "epoch": 0.042792371092387614, + "grad_norm": 1.5314075946807861, + "learning_rate": 4.9717743867339963e-05, + "loss": 0.7884, + "step": 2914 + }, + { + "epoch": 0.04324760908273216, + "grad_norm": 1.7256252765655518, + "learning_rate": 4.9705020218228695e-05, + "loss": 0.7883, + "step": 2945 + }, + { + "epoch": 0.043702847073076714, + "grad_norm": 1.6262825727462769, + "learning_rate": 4.969201777632205e-05, + "loss": 0.8007, + "step": 2976 + }, + { + "epoch": 0.04415808506342126, + "grad_norm": 1.4939130544662476, + "learning_rate": 4.9678736688350846e-05, + "loss": 0.7862, + "step": 3007 + }, + { + "epoch": 0.04461332305376581, + "grad_norm": 1.6776609420776367, + "learning_rate": 4.966517710419033e-05, + "loss": 0.7977, + "step": 3038 + }, + { + "epoch": 0.04506856104411036, + "grad_norm": 1.7425099611282349, + "learning_rate": 4.965133917685858e-05, + "loss": 0.789, + "step": 3069 + }, + { + "epoch": 0.04552379903445491, + "grad_norm": 1.5114233493804932, + "learning_rate": 4.9637223062514714e-05, + "loss": 0.7915, + "step": 3100 + }, + { + "epoch": 0.045979037024799455, + "grad_norm": 1.6397335529327393, + "learning_rate": 4.962282892045718e-05, + "loss": 0.8127, + "step": 3131 + }, + { + "epoch": 0.04643427501514401, + "grad_norm": 1.7266038656234741, + "learning_rate": 4.9608156913121904e-05, + "loss": 0.827, + "step": 3162 + }, + { + "epoch": 0.046889513005488555, + "grad_norm": 1.5216429233551025, + "learning_rate": 4.959320720608049e-05, + "loss": 0.8146, + "step": 3193 + }, + { + "epoch": 0.0473447509958331, + "grad_norm": 1.9235813617706299, + "learning_rate": 4.9577979968038354e-05, + "loss": 0.8118, + "step": 3224 + }, + { + "epoch": 0.047799988986177655, + "grad_norm": 1.6768368482589722, + "learning_rate": 4.956247537083282e-05, + "loss": 0.7925, + "step": 3255 + }, + { + "epoch": 0.0482552269765222, + "grad_norm": 1.6357700824737549, + "learning_rate": 4.9546693589431145e-05, + "loss": 0.7743, + "step": 3286 + }, + { + "epoch": 0.04871046496686675, + "grad_norm": 1.6823676824569702, + "learning_rate": 4.9530634801928595e-05, + "loss": 0.7949, + "step": 3317 + }, + { + "epoch": 0.0491657029572113, + "grad_norm": 1.4716497659683228, + "learning_rate": 4.9514299189546395e-05, + "loss": 0.7755, + "step": 3348 + }, + { + "epoch": 0.04962094094755585, + "grad_norm": 3.1948273181915283, + "learning_rate": 4.949768693662973e-05, + "loss": 0.7867, + "step": 3379 + }, + { + "epoch": 0.050076178937900395, + "grad_norm": 1.7412703037261963, + "learning_rate": 4.948079823064559e-05, + "loss": 0.788, + "step": 3410 + }, + { + "epoch": 0.05053141692824495, + "grad_norm": 1.5544873476028442, + "learning_rate": 4.946363326218074e-05, + "loss": 0.7752, + "step": 3441 + }, + { + "epoch": 0.050986654918589495, + "grad_norm": 1.5616456270217896, + "learning_rate": 4.9446192224939525e-05, + "loss": 0.7752, + "step": 3472 + }, + { + "epoch": 0.05144189290893404, + "grad_norm": 1.4451948404312134, + "learning_rate": 4.942847531574167e-05, + "loss": 0.7786, + "step": 3503 + }, + { + "epoch": 0.051897130899278596, + "grad_norm": 1.5473634004592896, + "learning_rate": 4.941048273452008e-05, + "loss": 0.7824, + "step": 3534 + }, + { + "epoch": 0.05235236888962314, + "grad_norm": 1.4772653579711914, + "learning_rate": 4.9392214684318605e-05, + "loss": 0.7873, + "step": 3565 + }, + { + "epoch": 0.052807606879967696, + "grad_norm": 1.4708329439163208, + "learning_rate": 4.93736713712897e-05, + "loss": 0.7851, + "step": 3596 + }, + { + "epoch": 0.05326284487031224, + "grad_norm": 1.469369649887085, + "learning_rate": 4.9354853004692124e-05, + "loss": 0.7874, + "step": 3627 + }, + { + "epoch": 0.05371808286065679, + "grad_norm": 1.3600234985351562, + "learning_rate": 4.93357597968886e-05, + "loss": 0.7792, + "step": 3658 + }, + { + "epoch": 0.05417332085100134, + "grad_norm": 1.6479856967926025, + "learning_rate": 4.931639196334338e-05, + "loss": 0.7886, + "step": 3689 + }, + { + "epoch": 0.05462855884134589, + "grad_norm": 1.5576198101043701, + "learning_rate": 4.9296749722619826e-05, + "loss": 0.7873, + "step": 3720 + }, + { + "epoch": 0.055083796831690436, + "grad_norm": 1.5949997901916504, + "learning_rate": 4.9276833296377966e-05, + "loss": 0.7794, + "step": 3751 + }, + { + "epoch": 0.05553903482203499, + "grad_norm": 1.3033227920532227, + "learning_rate": 4.925664290937196e-05, + "loss": 0.7763, + "step": 3782 + }, + { + "epoch": 0.055994272812379536, + "grad_norm": 1.5083190202713013, + "learning_rate": 4.9236178789447576e-05, + "loss": 0.7796, + "step": 3813 + }, + { + "epoch": 0.05644951080272408, + "grad_norm": 1.4691548347473145, + "learning_rate": 4.921544116753962e-05, + "loss": 0.7744, + "step": 3844 + }, + { + "epoch": 0.05690474879306864, + "grad_norm": 1.4447206258773804, + "learning_rate": 4.919443027766935e-05, + "loss": 0.7812, + "step": 3875 + }, + { + "epoch": 0.05735998678341318, + "grad_norm": 1.4616878032684326, + "learning_rate": 4.91731463569418e-05, + "loss": 0.7824, + "step": 3906 + }, + { + "epoch": 0.05781522477375773, + "grad_norm": 1.4808290004730225, + "learning_rate": 4.915158964554312e-05, + "loss": 0.7777, + "step": 3937 + }, + { + "epoch": 0.058270462764102283, + "grad_norm": 1.4712798595428467, + "learning_rate": 4.912976038673786e-05, + "loss": 0.7801, + "step": 3968 + }, + { + "epoch": 0.05872570075444683, + "grad_norm": 1.4899425506591797, + "learning_rate": 4.9107658826866254e-05, + "loss": 0.7852, + "step": 3999 + }, + { + "epoch": 0.05918093874479138, + "grad_norm": 1.2921983003616333, + "learning_rate": 4.908528521534139e-05, + "loss": 0.7735, + "step": 4030 + }, + { + "epoch": 0.05963617673513593, + "grad_norm": 1.3849194049835205, + "learning_rate": 4.906263980464644e-05, + "loss": 0.7893, + "step": 4061 + }, + { + "epoch": 0.06009141472548048, + "grad_norm": 1.442808747291565, + "learning_rate": 4.903972285033178e-05, + "loss": 0.784, + "step": 4092 + }, + { + "epoch": 0.060546652715825024, + "grad_norm": 1.4925036430358887, + "learning_rate": 4.901653461101213e-05, + "loss": 0.7714, + "step": 4123 + }, + { + "epoch": 0.06100189070616958, + "grad_norm": 1.4630024433135986, + "learning_rate": 4.8993075348363626e-05, + "loss": 0.7677, + "step": 4154 + }, + { + "epoch": 0.061457128696514124, + "grad_norm": 1.344081163406372, + "learning_rate": 4.896934532712084e-05, + "loss": 0.7788, + "step": 4185 + }, + { + "epoch": 0.06191236668685868, + "grad_norm": 1.449648380279541, + "learning_rate": 4.8945344815073846e-05, + "loss": 0.7661, + "step": 4216 + }, + { + "epoch": 0.062367604677203224, + "grad_norm": 1.450361728668213, + "learning_rate": 4.892107408306516e-05, + "loss": 0.7698, + "step": 4247 + }, + { + "epoch": 0.06282284266754777, + "grad_norm": 1.5068191289901733, + "learning_rate": 4.889653340498669e-05, + "loss": 0.7758, + "step": 4278 + }, + { + "epoch": 0.06327808065789232, + "grad_norm": 1.3638824224472046, + "learning_rate": 4.8871723057776664e-05, + "loss": 0.7702, + "step": 4309 + }, + { + "epoch": 0.06373331864823686, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.8846643321416476e-05, + "loss": 0.7755, + "step": 4340 + }, + { + "epoch": 0.06418855663858142, + "grad_norm": 1.502684235572815, + "learning_rate": 4.882129447892753e-05, + "loss": 0.7677, + "step": 4371 + }, + { + "epoch": 0.06464379462892597, + "grad_norm": 1.5910828113555908, + "learning_rate": 4.8795676816368076e-05, + "loss": 0.7646, + "step": 4402 + }, + { + "epoch": 0.06509903261927051, + "grad_norm": 1.4607537984848022, + "learning_rate": 4.876979062282995e-05, + "loss": 0.7684, + "step": 4433 + }, + { + "epoch": 0.06555427060961506, + "grad_norm": 1.3823063373565674, + "learning_rate": 4.8743636190435325e-05, + "loss": 0.7695, + "step": 4464 + }, + { + "epoch": 0.06600950859995962, + "grad_norm": 1.3769381046295166, + "learning_rate": 4.871721381433344e-05, + "loss": 0.7746, + "step": 4495 + }, + { + "epoch": 0.06646474659030417, + "grad_norm": 1.3154246807098389, + "learning_rate": 4.869052379269719e-05, + "loss": 0.7699, + "step": 4526 + }, + { + "epoch": 0.06691998458064871, + "grad_norm": 1.4395617246627808, + "learning_rate": 4.866356642671985e-05, + "loss": 0.7654, + "step": 4557 + }, + { + "epoch": 0.06737522257099327, + "grad_norm": 1.333095908164978, + "learning_rate": 4.8636342020611634e-05, + "loss": 0.7729, + "step": 4588 + }, + { + "epoch": 0.06783046056133782, + "grad_norm": 1.3427667617797852, + "learning_rate": 4.860885088159626e-05, + "loss": 0.7701, + "step": 4619 + }, + { + "epoch": 0.06828569855168236, + "grad_norm": 1.4095017910003662, + "learning_rate": 4.858109331990751e-05, + "loss": 0.7711, + "step": 4650 + }, + { + "epoch": 0.06874093654202691, + "grad_norm": 1.4199680089950562, + "learning_rate": 4.855306964878567e-05, + "loss": 0.7797, + "step": 4681 + }, + { + "epoch": 0.06919617453237147, + "grad_norm": 1.3505098819732666, + "learning_rate": 4.8524780184474084e-05, + "loss": 0.7702, + "step": 4712 + }, + { + "epoch": 0.069651412522716, + "grad_norm": 1.3765984773635864, + "learning_rate": 4.8496225246215496e-05, + "loss": 0.7647, + "step": 4743 + }, + { + "epoch": 0.07010665051306056, + "grad_norm": 1.539703369140625, + "learning_rate": 4.8467405156248505e-05, + "loss": 0.7673, + "step": 4774 + }, + { + "epoch": 0.07056188850340511, + "grad_norm": 1.6388696432113647, + "learning_rate": 4.843832023980392e-05, + "loss": 0.7672, + "step": 4805 + }, + { + "epoch": 0.07101712649374965, + "grad_norm": 1.8714436292648315, + "learning_rate": 4.840897082510106e-05, + "loss": 0.7661, + "step": 4836 + }, + { + "epoch": 0.0714723644840942, + "grad_norm": 1.5175424814224243, + "learning_rate": 4.8379357243344084e-05, + "loss": 0.7709, + "step": 4867 + }, + { + "epoch": 0.07192760247443876, + "grad_norm": 1.3103501796722412, + "learning_rate": 4.8349479828718236e-05, + "loss": 0.7701, + "step": 4898 + }, + { + "epoch": 0.0723828404647833, + "grad_norm": 1.4692376852035522, + "learning_rate": 4.8319338918386075e-05, + "loss": 0.7602, + "step": 4929 + }, + { + "epoch": 0.07283807845512785, + "grad_norm": 1.390416145324707, + "learning_rate": 4.828893485248369e-05, + "loss": 0.7896, + "step": 4960 + }, + { + "epoch": 0.0732933164454724, + "grad_norm": 1.4157830476760864, + "learning_rate": 4.825826797411682e-05, + "loss": 0.7675, + "step": 4991 + }, + { + "epoch": 0.07374855443581695, + "grad_norm": 1.2405292987823486, + "learning_rate": 4.822733862935702e-05, + "loss": 0.7679, + "step": 5022 + }, + { + "epoch": 0.0742037924261615, + "grad_norm": 1.4902536869049072, + "learning_rate": 4.819614716723775e-05, + "loss": 0.7693, + "step": 5053 + }, + { + "epoch": 0.07465903041650605, + "grad_norm": 1.4391045570373535, + "learning_rate": 4.8164693939750425e-05, + "loss": 0.7699, + "step": 5084 + }, + { + "epoch": 0.07511426840685059, + "grad_norm": 1.4432806968688965, + "learning_rate": 4.813297930184042e-05, + "loss": 0.763, + "step": 5115 + }, + { + "epoch": 0.07556950639719515, + "grad_norm": 1.3630146980285645, + "learning_rate": 4.810100361140314e-05, + "loss": 0.7618, + "step": 5146 + }, + { + "epoch": 0.0760247443875397, + "grad_norm": 1.5038788318634033, + "learning_rate": 4.8068767229279885e-05, + "loss": 0.7685, + "step": 5177 + }, + { + "epoch": 0.07647998237788424, + "grad_norm": 1.4100326299667358, + "learning_rate": 4.8036270519253854e-05, + "loss": 0.7645, + "step": 5208 + }, + { + "epoch": 0.0769352203682288, + "grad_norm": 1.377821445465088, + "learning_rate": 4.8003513848046e-05, + "loss": 0.7643, + "step": 5239 + }, + { + "epoch": 0.07739045835857335, + "grad_norm": 1.3848881721496582, + "learning_rate": 4.79704975853109e-05, + "loss": 0.7711, + "step": 5270 + }, + { + "epoch": 0.07784569634891789, + "grad_norm": 1.6327494382858276, + "learning_rate": 4.793722210363262e-05, + "loss": 0.7591, + "step": 5301 + }, + { + "epoch": 0.07830093433926244, + "grad_norm": 1.51273512840271, + "learning_rate": 4.7903687778520414e-05, + "loss": 0.7697, + "step": 5332 + }, + { + "epoch": 0.078756172329607, + "grad_norm": 1.447487235069275, + "learning_rate": 4.7869894988404593e-05, + "loss": 0.7602, + "step": 5363 + }, + { + "epoch": 0.07921141031995153, + "grad_norm": 1.3227125406265259, + "learning_rate": 4.783584411463221e-05, + "loss": 0.7616, + "step": 5394 + }, + { + "epoch": 0.07966664831029609, + "grad_norm": 1.4407910108566284, + "learning_rate": 4.780153554146274e-05, + "loss": 0.7618, + "step": 5425 + }, + { + "epoch": 0.08012188630064064, + "grad_norm": 1.2349289655685425, + "learning_rate": 4.7766969656063766e-05, + "loss": 0.7688, + "step": 5456 + }, + { + "epoch": 0.08057712429098518, + "grad_norm": 1.3858132362365723, + "learning_rate": 4.773214684850662e-05, + "loss": 0.7517, + "step": 5487 + }, + { + "epoch": 0.08103236228132973, + "grad_norm": 1.2951774597167969, + "learning_rate": 4.769706751176193e-05, + "loss": 0.7588, + "step": 5518 + }, + { + "epoch": 0.08148760027167429, + "grad_norm": 1.2295372486114502, + "learning_rate": 4.7661732041695264e-05, + "loss": 0.7522, + "step": 5549 + }, + { + "epoch": 0.08194283826201883, + "grad_norm": 1.4258685111999512, + "learning_rate": 4.762614083706258e-05, + "loss": 0.7666, + "step": 5580 + }, + { + "epoch": 0.08239807625236338, + "grad_norm": 1.5323545932769775, + "learning_rate": 4.759029429950581e-05, + "loss": 0.7587, + "step": 5611 + }, + { + "epoch": 0.08285331424270793, + "grad_norm": 1.3712198734283447, + "learning_rate": 4.7554192833548235e-05, + "loss": 0.7597, + "step": 5642 + }, + { + "epoch": 0.08330855223305247, + "grad_norm": 1.389145016670227, + "learning_rate": 4.751783684659e-05, + "loss": 0.7484, + "step": 5673 + }, + { + "epoch": 0.08376379022339703, + "grad_norm": 1.466799020767212, + "learning_rate": 4.748122674890348e-05, + "loss": 0.7586, + "step": 5704 + }, + { + "epoch": 0.08421902821374158, + "grad_norm": 1.4522569179534912, + "learning_rate": 4.7444362953628654e-05, + "loss": 0.7615, + "step": 5735 + }, + { + "epoch": 0.08467426620408613, + "grad_norm": 1.6374691724777222, + "learning_rate": 4.7407245876768424e-05, + "loss": 0.7573, + "step": 5766 + }, + { + "epoch": 0.08512950419443067, + "grad_norm": 1.384748101234436, + "learning_rate": 4.736987593718397e-05, + "loss": 0.7516, + "step": 5797 + }, + { + "epoch": 0.08558474218477523, + "grad_norm": 1.366335153579712, + "learning_rate": 4.733225355658999e-05, + "loss": 0.7681, + "step": 5828 + }, + { + "epoch": 0.08603998017511978, + "grad_norm": 1.3855708837509155, + "learning_rate": 4.7294379159549926e-05, + "loss": 0.7557, + "step": 5859 + }, + { + "epoch": 0.08649521816546432, + "grad_norm": 1.3614453077316284, + "learning_rate": 4.725625317347119e-05, + "loss": 0.7624, + "step": 5890 + }, + { + "epoch": 0.08695045615580888, + "grad_norm": 1.2072994709014893, + "learning_rate": 4.7217876028600374e-05, + "loss": 0.7545, + "step": 5921 + }, + { + "epoch": 0.08740569414615343, + "grad_norm": 1.5530472993850708, + "learning_rate": 4.717924815801832e-05, + "loss": 0.7574, + "step": 5952 + }, + { + "epoch": 0.08786093213649797, + "grad_norm": 1.3522703647613525, + "learning_rate": 4.714036999763532e-05, + "loss": 0.7608, + "step": 5983 + }, + { + "epoch": 0.08831617012684252, + "grad_norm": 1.342637300491333, + "learning_rate": 4.7101241986186116e-05, + "loss": 0.7519, + "step": 6014 + }, + { + "epoch": 0.08877140811718708, + "grad_norm": 1.2783684730529785, + "learning_rate": 4.7061864565225e-05, + "loss": 0.7565, + "step": 6045 + }, + { + "epoch": 0.08922664610753162, + "grad_norm": 1.1245291233062744, + "learning_rate": 4.702223817912081e-05, + "loss": 0.7655, + "step": 6076 + }, + { + "epoch": 0.08968188409787617, + "grad_norm": 1.2777146100997925, + "learning_rate": 4.698236327505195e-05, + "loss": 0.7526, + "step": 6107 + }, + { + "epoch": 0.09013712208822072, + "grad_norm": 1.2739795446395874, + "learning_rate": 4.694224030300127e-05, + "loss": 0.7594, + "step": 6138 + }, + { + "epoch": 0.09059236007856526, + "grad_norm": 1.358684778213501, + "learning_rate": 4.690186971575107e-05, + "loss": 0.7599, + "step": 6169 + }, + { + "epoch": 0.09104759806890982, + "grad_norm": 1.2908906936645508, + "learning_rate": 4.6861251968877916e-05, + "loss": 0.7564, + "step": 6200 + }, + { + "epoch": 0.09150283605925437, + "grad_norm": 1.2198729515075684, + "learning_rate": 4.68203875207476e-05, + "loss": 0.7595, + "step": 6231 + }, + { + "epoch": 0.09195807404959891, + "grad_norm": 1.337134599685669, + "learning_rate": 4.677927683250983e-05, + "loss": 0.7512, + "step": 6262 + }, + { + "epoch": 0.09241331203994346, + "grad_norm": 1.4034923315048218, + "learning_rate": 4.6737920368093156e-05, + "loss": 0.7587, + "step": 6293 + }, + { + "epoch": 0.09286855003028802, + "grad_norm": 1.2584336996078491, + "learning_rate": 4.669631859419965e-05, + "loss": 0.7532, + "step": 6324 + }, + { + "epoch": 0.09332378802063256, + "grad_norm": 1.4327120780944824, + "learning_rate": 4.6654471980299676e-05, + "loss": 0.7528, + "step": 6355 + }, + { + "epoch": 0.09377902601097711, + "grad_norm": 1.386634111404419, + "learning_rate": 4.661238099862658e-05, + "loss": 0.7455, + "step": 6386 + }, + { + "epoch": 0.09423426400132166, + "grad_norm": 1.373677372932434, + "learning_rate": 4.657004612417138e-05, + "loss": 0.7519, + "step": 6417 + }, + { + "epoch": 0.0946895019916662, + "grad_norm": 1.3846200704574585, + "learning_rate": 4.6527467834677374e-05, + "loss": 0.7505, + "step": 6448 + }, + { + "epoch": 0.09514473998201076, + "grad_norm": 1.3237924575805664, + "learning_rate": 4.648464661063478e-05, + "loss": 0.7562, + "step": 6479 + }, + { + "epoch": 0.09559997797235531, + "grad_norm": 1.3368539810180664, + "learning_rate": 4.6441582935275264e-05, + "loss": 0.7523, + "step": 6510 + }, + { + "epoch": 0.09605521596269985, + "grad_norm": 1.1844121217727661, + "learning_rate": 4.6398277294566586e-05, + "loss": 0.7535, + "step": 6541 + }, + { + "epoch": 0.0965104539530444, + "grad_norm": 1.3107521533966064, + "learning_rate": 4.6354730177207e-05, + "loss": 0.7609, + "step": 6572 + }, + { + "epoch": 0.09696569194338896, + "grad_norm": 1.3678165674209595, + "learning_rate": 4.6310942074619787e-05, + "loss": 0.7513, + "step": 6603 + }, + { + "epoch": 0.0974209299337335, + "grad_norm": 1.2169839143753052, + "learning_rate": 4.626691348094777e-05, + "loss": 0.7464, + "step": 6634 + }, + { + "epoch": 0.09787616792407805, + "grad_norm": 1.5456466674804688, + "learning_rate": 4.622264489304762e-05, + "loss": 0.7431, + "step": 6665 + }, + { + "epoch": 0.0983314059144226, + "grad_norm": 1.3606510162353516, + "learning_rate": 4.617813681048434e-05, + "loss": 0.7481, + "step": 6696 + }, + { + "epoch": 0.09878664390476714, + "grad_norm": 1.1854835748672485, + "learning_rate": 4.61333897355256e-05, + "loss": 0.7491, + "step": 6727 + }, + { + "epoch": 0.0992418818951117, + "grad_norm": 1.1682158708572388, + "learning_rate": 4.608840417313604e-05, + "loss": 0.7433, + "step": 6758 + }, + { + "epoch": 0.09969711988545625, + "grad_norm": 1.3889473676681519, + "learning_rate": 4.6043180630971646e-05, + "loss": 0.7414, + "step": 6789 + }, + { + "epoch": 0.10015235787580079, + "grad_norm": 1.351807713508606, + "learning_rate": 4.599771961937391e-05, + "loss": 0.7501, + "step": 6820 + }, + { + "epoch": 0.10060759586614534, + "grad_norm": 1.4737526178359985, + "learning_rate": 4.5952021651364204e-05, + "loss": 0.7508, + "step": 6851 + }, + { + "epoch": 0.1010628338564899, + "grad_norm": 1.3223670721054077, + "learning_rate": 4.590608724263786e-05, + "loss": 0.7557, + "step": 6882 + }, + { + "epoch": 0.10151807184683444, + "grad_norm": 1.182531714439392, + "learning_rate": 4.585991691155845e-05, + "loss": 0.7595, + "step": 6913 + }, + { + "epoch": 0.10197330983717899, + "grad_norm": 1.3309706449508667, + "learning_rate": 4.581351117915188e-05, + "loss": 0.7516, + "step": 6944 + }, + { + "epoch": 0.10242854782752354, + "grad_norm": 1.1631003618240356, + "learning_rate": 4.5766870569100534e-05, + "loss": 0.7538, + "step": 6975 + }, + { + "epoch": 0.10288378581786808, + "grad_norm": 1.150242805480957, + "learning_rate": 4.571999560773736e-05, + "loss": 0.7583, + "step": 7006 + }, + { + "epoch": 0.10333902380821264, + "grad_norm": 1.2031753063201904, + "learning_rate": 4.5672886824039915e-05, + "loss": 0.743, + "step": 7037 + }, + { + "epoch": 0.10379426179855719, + "grad_norm": 1.3973835706710815, + "learning_rate": 4.5625544749624435e-05, + "loss": 0.7477, + "step": 7068 + }, + { + "epoch": 0.10424949978890174, + "grad_norm": 1.3755607604980469, + "learning_rate": 4.5577969918739794e-05, + "loss": 0.7424, + "step": 7099 + }, + { + "epoch": 0.10470473777924628, + "grad_norm": 1.2959696054458618, + "learning_rate": 4.5530162868261486e-05, + "loss": 0.7432, + "step": 7130 + }, + { + "epoch": 0.10515997576959084, + "grad_norm": 1.4843213558197021, + "learning_rate": 4.548212413768558e-05, + "loss": 0.7519, + "step": 7161 + }, + { + "epoch": 0.10561521375993539, + "grad_norm": 1.3697874546051025, + "learning_rate": 4.543385426912261e-05, + "loss": 0.7546, + "step": 7192 + }, + { + "epoch": 0.10607045175027993, + "grad_norm": 1.3163423538208008, + "learning_rate": 4.53853538072915e-05, + "loss": 0.7328, + "step": 7223 + }, + { + "epoch": 0.10652568974062449, + "grad_norm": 1.200897455215454, + "learning_rate": 4.533662329951336e-05, + "loss": 0.7438, + "step": 7254 + }, + { + "epoch": 0.10698092773096904, + "grad_norm": 1.3377361297607422, + "learning_rate": 4.528766329570536e-05, + "loss": 0.7478, + "step": 7285 + }, + { + "epoch": 0.10743616572131358, + "grad_norm": 1.2304787635803223, + "learning_rate": 4.523847434837447e-05, + "loss": 0.7468, + "step": 7316 + }, + { + "epoch": 0.10789140371165813, + "grad_norm": 1.1947201490402222, + "learning_rate": 4.518905701261128e-05, + "loss": 0.7483, + "step": 7347 + }, + { + "epoch": 0.10834664170200269, + "grad_norm": 1.1942620277404785, + "learning_rate": 4.5139411846083715e-05, + "loss": 0.7522, + "step": 7378 + }, + { + "epoch": 0.10880187969234723, + "grad_norm": 1.2765998840332031, + "learning_rate": 4.508953940903073e-05, + "loss": 0.7425, + "step": 7409 + }, + { + "epoch": 0.10925711768269178, + "grad_norm": 1.1460707187652588, + "learning_rate": 4.5039440264255994e-05, + "loss": 0.7534, + "step": 7440 + }, + { + "epoch": 0.10971235567303633, + "grad_norm": 1.1739740371704102, + "learning_rate": 4.498911497712155e-05, + "loss": 0.7447, + "step": 7471 + }, + { + "epoch": 0.11016759366338087, + "grad_norm": 1.227232575416565, + "learning_rate": 4.493856411554142e-05, + "loss": 0.7452, + "step": 7502 + }, + { + "epoch": 0.11062283165372543, + "grad_norm": 1.3812710046768188, + "learning_rate": 4.4887788249975206e-05, + "loss": 0.7431, + "step": 7533 + }, + { + "epoch": 0.11107806964406998, + "grad_norm": 1.1743565797805786, + "learning_rate": 4.4836787953421656e-05, + "loss": 0.7421, + "step": 7564 + }, + { + "epoch": 0.11153330763441452, + "grad_norm": 1.2436307668685913, + "learning_rate": 4.478556380141218e-05, + "loss": 0.7461, + "step": 7595 + }, + { + "epoch": 0.11198854562475907, + "grad_norm": 1.2472264766693115, + "learning_rate": 4.4734116372004375e-05, + "loss": 0.7456, + "step": 7626 + }, + { + "epoch": 0.11244378361510363, + "grad_norm": 1.2209473848342896, + "learning_rate": 4.4682446245775477e-05, + "loss": 0.7544, + "step": 7657 + }, + { + "epoch": 0.11289902160544817, + "grad_norm": 1.3582866191864014, + "learning_rate": 4.463055400581586e-05, + "loss": 0.7464, + "step": 7688 + }, + { + "epoch": 0.11335425959579272, + "grad_norm": 1.2489479780197144, + "learning_rate": 4.4578440237722374e-05, + "loss": 0.7394, + "step": 7719 + }, + { + "epoch": 0.11380949758613727, + "grad_norm": 1.2384752035140991, + "learning_rate": 4.452610552959183e-05, + "loss": 0.7358, + "step": 7750 + }, + { + "epoch": 0.11426473557648181, + "grad_norm": 1.3618046045303345, + "learning_rate": 4.447355047201428e-05, + "loss": 0.742, + "step": 7781 + }, + { + "epoch": 0.11471997356682637, + "grad_norm": 1.2598398923873901, + "learning_rate": 4.4420775658066414e-05, + "loss": 0.7407, + "step": 7812 + }, + { + "epoch": 0.11517521155717092, + "grad_norm": 1.1453444957733154, + "learning_rate": 4.436778168330484e-05, + "loss": 0.7499, + "step": 7843 + }, + { + "epoch": 0.11563044954751546, + "grad_norm": 1.3463783264160156, + "learning_rate": 4.4314569145759353e-05, + "loss": 0.7388, + "step": 7874 + }, + { + "epoch": 0.11608568753786001, + "grad_norm": 1.2431267499923706, + "learning_rate": 4.42611386459262e-05, + "loss": 0.7487, + "step": 7905 + }, + { + "epoch": 0.11654092552820457, + "grad_norm": 1.2726207971572876, + "learning_rate": 4.420749078676133e-05, + "loss": 0.7468, + "step": 7936 + }, + { + "epoch": 0.1169961635185491, + "grad_norm": 1.1979873180389404, + "learning_rate": 4.4153626173673516e-05, + "loss": 0.7482, + "step": 7967 + }, + { + "epoch": 0.11745140150889366, + "grad_norm": 1.2321964502334595, + "learning_rate": 4.409954541451762e-05, + "loss": 0.7363, + "step": 7998 + }, + { + "epoch": 0.11790663949923821, + "grad_norm": 1.265199065208435, + "learning_rate": 4.404524911958764e-05, + "loss": 0.7466, + "step": 8029 + }, + { + "epoch": 0.11836187748958275, + "grad_norm": 1.3418115377426147, + "learning_rate": 4.399073790160989e-05, + "loss": 0.7433, + "step": 8060 + }, + { + "epoch": 0.11881711547992731, + "grad_norm": 1.276666522026062, + "learning_rate": 4.393601237573607e-05, + "loss": 0.7457, + "step": 8091 + }, + { + "epoch": 0.11927235347027186, + "grad_norm": 1.3559373617172241, + "learning_rate": 4.388107315953628e-05, + "loss": 0.7451, + "step": 8122 + }, + { + "epoch": 0.1197275914606164, + "grad_norm": 1.3017044067382812, + "learning_rate": 4.382592087299212e-05, + "loss": 0.736, + "step": 8153 + }, + { + "epoch": 0.12018282945096095, + "grad_norm": 1.230873703956604, + "learning_rate": 4.377055613848964e-05, + "loss": 0.7409, + "step": 8184 + }, + { + "epoch": 0.12063806744130551, + "grad_norm": 1.3415979146957397, + "learning_rate": 4.3714979580812355e-05, + "loss": 0.7456, + "step": 8215 + }, + { + "epoch": 0.12109330543165005, + "grad_norm": 1.3706104755401611, + "learning_rate": 4.365919182713416e-05, + "loss": 0.7425, + "step": 8246 + }, + { + "epoch": 0.1215485434219946, + "grad_norm": 1.3071256875991821, + "learning_rate": 4.360319350701226e-05, + "loss": 0.7312, + "step": 8277 + }, + { + "epoch": 0.12200378141233915, + "grad_norm": 1.1925092935562134, + "learning_rate": 4.3546985252380115e-05, + "loss": 0.7457, + "step": 8308 + }, + { + "epoch": 0.12245901940268371, + "grad_norm": 1.2254852056503296, + "learning_rate": 4.349056769754021e-05, + "loss": 0.7454, + "step": 8339 + }, + { + "epoch": 0.12291425739302825, + "grad_norm": 1.2121927738189697, + "learning_rate": 4.3433941479156994e-05, + "loss": 0.7451, + "step": 8370 + }, + { + "epoch": 0.1233694953833728, + "grad_norm": 1.4518349170684814, + "learning_rate": 4.3377107236249647e-05, + "loss": 0.7376, + "step": 8401 + }, + { + "epoch": 0.12382473337371736, + "grad_norm": 1.2487318515777588, + "learning_rate": 4.332006561018488e-05, + "loss": 0.7468, + "step": 8432 + }, + { + "epoch": 0.1242799713640619, + "grad_norm": 1.2601664066314697, + "learning_rate": 4.3262817244669683e-05, + "loss": 0.7341, + "step": 8463 + }, + { + "epoch": 0.12473520935440645, + "grad_norm": 2.2060904502868652, + "learning_rate": 4.3205362785744083e-05, + "loss": 0.7421, + "step": 8494 + }, + { + "epoch": 0.125190447344751, + "grad_norm": 1.2388501167297363, + "learning_rate": 4.314770288177384e-05, + "loss": 0.7372, + "step": 8525 + }, + { + "epoch": 0.12564568533509554, + "grad_norm": 1.20314359664917, + "learning_rate": 4.308983818344313e-05, + "loss": 0.7308, + "step": 8556 + }, + { + "epoch": 0.1261009233254401, + "grad_norm": 1.2387797832489014, + "learning_rate": 4.3031769343747206e-05, + "loss": 0.7364, + "step": 8587 + }, + { + "epoch": 0.12655616131578465, + "grad_norm": 1.2150596380233765, + "learning_rate": 4.297349701798505e-05, + "loss": 0.7339, + "step": 8618 + }, + { + "epoch": 0.1270113993061292, + "grad_norm": 1.3160388469696045, + "learning_rate": 4.2915021863751916e-05, + "loss": 0.7448, + "step": 8649 + }, + { + "epoch": 0.12746663729647373, + "grad_norm": 1.2065149545669556, + "learning_rate": 4.285634454093198e-05, + "loss": 0.7373, + "step": 8680 + }, + { + "epoch": 0.12792187528681828, + "grad_norm": 1.2958154678344727, + "learning_rate": 4.279746571169086e-05, + "loss": 0.7282, + "step": 8711 + }, + { + "epoch": 0.12837711327716284, + "grad_norm": 1.2723522186279297, + "learning_rate": 4.2738386040468136e-05, + "loss": 0.7274, + "step": 8742 + }, + { + "epoch": 0.1288323512675074, + "grad_norm": 1.1709808111190796, + "learning_rate": 4.2679106193969866e-05, + "loss": 0.7459, + "step": 8773 + }, + { + "epoch": 0.12928758925785194, + "grad_norm": 1.3450511693954468, + "learning_rate": 4.261962684116106e-05, + "loss": 0.7334, + "step": 8804 + }, + { + "epoch": 0.1297428272481965, + "grad_norm": 1.3592984676361084, + "learning_rate": 4.2559948653258145e-05, + "loss": 0.7415, + "step": 8835 + }, + { + "epoch": 0.13019806523854102, + "grad_norm": 1.2315089702606201, + "learning_rate": 4.250007230372134e-05, + "loss": 0.7296, + "step": 8866 + }, + { + "epoch": 0.13065330322888558, + "grad_norm": 1.0693110227584839, + "learning_rate": 4.2439998468247126e-05, + "loss": 0.7269, + "step": 8897 + }, + { + "epoch": 0.13110854121923013, + "grad_norm": 1.0341922044754028, + "learning_rate": 4.2379727824760566e-05, + "loss": 0.7257, + "step": 8928 + }, + { + "epoch": 0.13156377920957468, + "grad_norm": 1.3500678539276123, + "learning_rate": 4.231926105340768e-05, + "loss": 0.7379, + "step": 8959 + }, + { + "epoch": 0.13201901719991924, + "grad_norm": 1.3482787609100342, + "learning_rate": 4.225859883654776e-05, + "loss": 0.7393, + "step": 8990 + }, + { + "epoch": 0.1324742551902638, + "grad_norm": 1.3126060962677002, + "learning_rate": 4.219774185874569e-05, + "loss": 0.7391, + "step": 9021 + }, + { + "epoch": 0.13292949318060834, + "grad_norm": 1.2438606023788452, + "learning_rate": 4.213669080676418e-05, + "loss": 0.7347, + "step": 9052 + }, + { + "epoch": 0.13338473117095287, + "grad_norm": 1.3756070137023926, + "learning_rate": 4.2075446369556056e-05, + "loss": 0.7321, + "step": 9083 + }, + { + "epoch": 0.13383996916129742, + "grad_norm": 1.189840316772461, + "learning_rate": 4.201400923825648e-05, + "loss": 0.7352, + "step": 9114 + }, + { + "epoch": 0.13429520715164198, + "grad_norm": 1.1982426643371582, + "learning_rate": 4.195238010617511e-05, + "loss": 0.7388, + "step": 9145 + } + ], + "logging_steps": 31, + "max_steps": 30517, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3052, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.754968236539773e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-9156/training_args.bin b/checkpoint-9156/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dd0c30645908fd59ad9d6091c5e2e08018856b49 --- /dev/null +++ b/checkpoint-9156/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2ae676cb4770c8405bd6fc6962f1ef1289d669b20872c9a947682a2673db22 +size 5304 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2e378a7091e0c0c8eaa10d0bd7796835fe4b64 --- /dev/null +++ b/config.json @@ -0,0 +1,39 @@ +{ + "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "torch_dtype": "float32", + "transformers_version": "4.44.0.dev0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d952a3697e90a24460d766a494a8bb9cf80534 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.44.0.dev0" +} diff --git a/model-00001-of-00007.safetensors b/model-00001-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e32d3fb1ab3b688aa85ce479a51c3b384c12e8c5 --- /dev/null +++ b/model-00001-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11a359b750f6ab7df9c886897f4c9a6e9809f7b3ff21fc7d4d310c85a52c98c5 +size 4886466168 diff --git a/model-00002-of-00007.safetensors b/model-00002-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fbc8e97904ed5acdba6240901a52d7ea3c73628d --- /dev/null +++ b/model-00002-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23fa5aaf8ac56c17c821980c9aee44fcbc2a617fc570e875e6fdb93d92886c72 +size 4832007448 diff --git a/model-00003-of-00007.safetensors b/model-00003-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aeab8b855338147792249baf9c680bfb29ea42a5 --- /dev/null +++ b/model-00003-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a223e87240e50dfb05634ea909cf1ec715d37a12b681b14a3a6e97f28688f2b +size 4999813112 diff --git a/model-00004-of-00007.safetensors b/model-00004-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ac9e7b6b5edbe63eec2dd4c7d3310db1f10fdda9 --- /dev/null +++ b/model-00004-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3047d199c611c6d12b1cca780f033ad89df9c076f742674f7ac4a892ca9573b3 +size 4999813128 diff --git a/model-00005-of-00007.safetensors b/model-00005-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48718503a952853ffd5e231b9dba74b7026e0e01 --- /dev/null +++ b/model-00005-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2f91d749d664ecd22c71f3d54488966fc6899a03924b8fea3613c8f83398a8 +size 4832007496 diff --git a/model-00006-of-00007.safetensors b/model-00006-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0f9d24d60f542bdc40d1b63728e4f4807c43a53e --- /dev/null +++ b/model-00006-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80d44e66e2e18efa285692398241c7f447826a9503eaa614a71d6417684a5ab2 +size 4999813120 diff --git a/model-00007-of-00007.safetensors b/model-00007-of-00007.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..41844e746d4850e8773a61a7aebc7da8fb5ce539 --- /dev/null +++ b/model-00007-of-00007.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5a3b99186448cc9d7508459f20329d6ec2ec7647dfcdb4c5e216066eadf5bb6 +size 2571158184 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..0985084320f65a678bea622edb50d3771af4fd13 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,298 @@ +{ + "metadata": { + "total_size": 32121044992 + }, + "weight_map": { + "lm_head.weight": "model-00007-of-00007.safetensors", + "model.embed_tokens.weight": "model-00001-of-00007.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00007.safetensors", + "model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00007.safetensors", + "model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00007.safetensors", + "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00007.safetensors", + "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors", + "model.norm.weight": "model-00007-of-00007.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc5f00a5b203e90a27a3bd60d1ec393b07971e8 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,410563 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 128000, + "content": "<|begin_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128001, + "content": "<|end_of_text|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128002, + "content": "<|reserved_special_token_0|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128003, + "content": "<|reserved_special_token_1|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128004, + "content": "<|finetune_right_pad_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128005, + "content": "<|reserved_special_token_2|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128006, + "content": "<|start_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128007, + "content": "<|end_header_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128008, + "content": "<|eom_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128009, + "content": "<|eot_id|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128010, + "content": "<|python_tag|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128011, + "content": "<|reserved_special_token_3|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128012, + "content": "<|reserved_special_token_4|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128013, + "content": "<|reserved_special_token_5|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128014, + "content": "<|reserved_special_token_6|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128015, + "content": "<|reserved_special_token_7|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128016, + "content": "<|reserved_special_token_8|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128017, + "content": "<|reserved_special_token_9|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128018, + "content": "<|reserved_special_token_10|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128019, + "content": "<|reserved_special_token_11|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128020, + "content": "<|reserved_special_token_12|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128021, + "content": "<|reserved_special_token_13|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128022, + "content": "<|reserved_special_token_14|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128023, + "content": "<|reserved_special_token_15|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128024, + "content": "<|reserved_special_token_16|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128025, + "content": "<|reserved_special_token_17|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128026, + "content": "<|reserved_special_token_18|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128027, + "content": "<|reserved_special_token_19|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128028, + "content": "<|reserved_special_token_20|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128029, + "content": "<|reserved_special_token_21|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128030, + "content": "<|reserved_special_token_22|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128031, + "content": "<|reserved_special_token_23|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128032, + "content": "<|reserved_special_token_24|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128033, + "content": "<|reserved_special_token_25|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128034, + "content": "<|reserved_special_token_26|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128035, + "content": "<|reserved_special_token_27|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128036, + "content": "<|reserved_special_token_28|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128037, + "content": "<|reserved_special_token_29|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128038, + "content": "<|reserved_special_token_30|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128039, + "content": "<|reserved_special_token_31|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128040, + "content": "<|reserved_special_token_32|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128041, + "content": "<|reserved_special_token_33|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128042, + "content": "<|reserved_special_token_34|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128043, + "content": "<|reserved_special_token_35|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128044, + "content": "<|reserved_special_token_36|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128045, + "content": "<|reserved_special_token_37|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128046, + "content": "<|reserved_special_token_38|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128047, + "content": "<|reserved_special_token_39|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128048, + "content": "<|reserved_special_token_40|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128049, + "content": "<|reserved_special_token_41|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128050, + "content": "<|reserved_special_token_42|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128051, + "content": "<|reserved_special_token_43|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128052, + "content": "<|reserved_special_token_44|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128053, + "content": "<|reserved_special_token_45|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128054, + "content": "<|reserved_special_token_46|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128055, + "content": "<|reserved_special_token_47|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128056, + "content": "<|reserved_special_token_48|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128057, + "content": "<|reserved_special_token_49|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128058, + "content": "<|reserved_special_token_50|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128059, + "content": "<|reserved_special_token_51|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128060, + "content": "<|reserved_special_token_52|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128061, + "content": "<|reserved_special_token_53|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128062, + "content": "<|reserved_special_token_54|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128063, + "content": "<|reserved_special_token_55|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128064, + "content": "<|reserved_special_token_56|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128065, + "content": "<|reserved_special_token_57|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128066, + "content": "<|reserved_special_token_58|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128067, + "content": "<|reserved_special_token_59|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128068, + "content": "<|reserved_special_token_60|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128069, + "content": "<|reserved_special_token_61|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128070, + "content": "<|reserved_special_token_62|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128071, + "content": "<|reserved_special_token_63|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128072, + "content": "<|reserved_special_token_64|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128073, + "content": "<|reserved_special_token_65|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128074, + "content": "<|reserved_special_token_66|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128075, + "content": "<|reserved_special_token_67|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128076, + "content": "<|reserved_special_token_68|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128077, + "content": "<|reserved_special_token_69|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128078, + "content": "<|reserved_special_token_70|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128079, + "content": "<|reserved_special_token_71|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128080, + "content": "<|reserved_special_token_72|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128081, + "content": "<|reserved_special_token_73|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128082, + "content": "<|reserved_special_token_74|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128083, + "content": "<|reserved_special_token_75|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128084, + "content": "<|reserved_special_token_76|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128085, + "content": "<|reserved_special_token_77|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128086, + "content": "<|reserved_special_token_78|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128087, + "content": "<|reserved_special_token_79|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128088, + "content": "<|reserved_special_token_80|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128089, + "content": "<|reserved_special_token_81|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128090, + "content": "<|reserved_special_token_82|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128091, + "content": "<|reserved_special_token_83|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128092, + "content": "<|reserved_special_token_84|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128093, + "content": "<|reserved_special_token_85|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128094, + "content": "<|reserved_special_token_86|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128095, + "content": "<|reserved_special_token_87|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128096, + "content": "<|reserved_special_token_88|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128097, + "content": "<|reserved_special_token_89|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128098, + "content": "<|reserved_special_token_90|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128099, + "content": "<|reserved_special_token_91|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128100, + "content": "<|reserved_special_token_92|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128101, + "content": "<|reserved_special_token_93|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128102, + "content": "<|reserved_special_token_94|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128103, + "content": "<|reserved_special_token_95|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128104, + "content": "<|reserved_special_token_96|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128105, + "content": "<|reserved_special_token_97|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128106, + "content": "<|reserved_special_token_98|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128107, + "content": "<|reserved_special_token_99|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128108, + "content": "<|reserved_special_token_100|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128109, + "content": "<|reserved_special_token_101|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128110, + "content": "<|reserved_special_token_102|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128111, + "content": "<|reserved_special_token_103|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128112, + "content": "<|reserved_special_token_104|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128113, + "content": "<|reserved_special_token_105|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128114, + "content": "<|reserved_special_token_106|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128115, + "content": "<|reserved_special_token_107|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128116, + "content": "<|reserved_special_token_108|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128117, + "content": "<|reserved_special_token_109|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128118, + "content": "<|reserved_special_token_110|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128119, + "content": "<|reserved_special_token_111|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128120, + "content": "<|reserved_special_token_112|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128121, + "content": "<|reserved_special_token_113|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128122, + "content": "<|reserved_special_token_114|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128123, + "content": "<|reserved_special_token_115|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128124, + "content": "<|reserved_special_token_116|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128125, + "content": "<|reserved_special_token_117|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128126, + "content": "<|reserved_special_token_118|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128127, + "content": "<|reserved_special_token_119|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128128, + "content": "<|reserved_special_token_120|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128129, + "content": "<|reserved_special_token_121|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128130, + "content": "<|reserved_special_token_122|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128131, + "content": "<|reserved_special_token_123|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128132, + "content": "<|reserved_special_token_124|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128133, + "content": "<|reserved_special_token_125|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128134, + "content": "<|reserved_special_token_126|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128135, + "content": "<|reserved_special_token_127|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128136, + "content": "<|reserved_special_token_128|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128137, + "content": "<|reserved_special_token_129|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128138, + "content": "<|reserved_special_token_130|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128139, + "content": "<|reserved_special_token_131|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128140, + "content": "<|reserved_special_token_132|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128141, + "content": "<|reserved_special_token_133|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128142, + "content": "<|reserved_special_token_134|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128143, + "content": "<|reserved_special_token_135|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128144, + "content": "<|reserved_special_token_136|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128145, + "content": "<|reserved_special_token_137|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128146, + "content": "<|reserved_special_token_138|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128147, + "content": "<|reserved_special_token_139|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128148, + "content": "<|reserved_special_token_140|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128149, + "content": "<|reserved_special_token_141|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128150, + "content": "<|reserved_special_token_142|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128151, + "content": "<|reserved_special_token_143|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128152, + "content": "<|reserved_special_token_144|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128153, + "content": "<|reserved_special_token_145|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128154, + "content": "<|reserved_special_token_146|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128155, + "content": "<|reserved_special_token_147|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128156, + "content": "<|reserved_special_token_148|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128157, + "content": "<|reserved_special_token_149|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128158, + "content": "<|reserved_special_token_150|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128159, + "content": "<|reserved_special_token_151|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128160, + "content": "<|reserved_special_token_152|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128161, + "content": "<|reserved_special_token_153|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128162, + "content": "<|reserved_special_token_154|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128163, + "content": "<|reserved_special_token_155|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128164, + "content": "<|reserved_special_token_156|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128165, + "content": "<|reserved_special_token_157|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128166, + "content": "<|reserved_special_token_158|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128167, + "content": "<|reserved_special_token_159|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128168, + "content": "<|reserved_special_token_160|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128169, + "content": "<|reserved_special_token_161|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128170, + "content": "<|reserved_special_token_162|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128171, + "content": "<|reserved_special_token_163|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128172, + "content": "<|reserved_special_token_164|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128173, + "content": "<|reserved_special_token_165|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128174, + "content": "<|reserved_special_token_166|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128175, + "content": "<|reserved_special_token_167|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128176, + "content": "<|reserved_special_token_168|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128177, + "content": "<|reserved_special_token_169|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128178, + "content": "<|reserved_special_token_170|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128179, + "content": "<|reserved_special_token_171|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128180, + "content": "<|reserved_special_token_172|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128181, + "content": "<|reserved_special_token_173|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128182, + "content": "<|reserved_special_token_174|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128183, + "content": "<|reserved_special_token_175|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128184, + "content": "<|reserved_special_token_176|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128185, + "content": "<|reserved_special_token_177|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128186, + "content": "<|reserved_special_token_178|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128187, + "content": "<|reserved_special_token_179|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128188, + "content": "<|reserved_special_token_180|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128189, + "content": "<|reserved_special_token_181|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128190, + "content": "<|reserved_special_token_182|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128191, + "content": "<|reserved_special_token_183|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128192, + "content": "<|reserved_special_token_184|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128193, + "content": "<|reserved_special_token_185|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128194, + "content": "<|reserved_special_token_186|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128195, + "content": "<|reserved_special_token_187|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128196, + "content": "<|reserved_special_token_188|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128197, + "content": "<|reserved_special_token_189|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128198, + "content": "<|reserved_special_token_190|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128199, + "content": "<|reserved_special_token_191|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128200, + "content": "<|reserved_special_token_192|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128201, + "content": "<|reserved_special_token_193|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128202, + "content": "<|reserved_special_token_194|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128203, + "content": "<|reserved_special_token_195|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128204, + "content": "<|reserved_special_token_196|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128205, + "content": "<|reserved_special_token_197|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128206, + "content": "<|reserved_special_token_198|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128207, + "content": "<|reserved_special_token_199|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128208, + "content": "<|reserved_special_token_200|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128209, + "content": "<|reserved_special_token_201|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128210, + "content": "<|reserved_special_token_202|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128211, + "content": "<|reserved_special_token_203|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128212, + "content": "<|reserved_special_token_204|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128213, + "content": "<|reserved_special_token_205|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128214, + "content": "<|reserved_special_token_206|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128215, + "content": "<|reserved_special_token_207|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128216, + "content": "<|reserved_special_token_208|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128217, + "content": "<|reserved_special_token_209|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128218, + "content": "<|reserved_special_token_210|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128219, + "content": "<|reserved_special_token_211|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128220, + "content": "<|reserved_special_token_212|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128221, + "content": "<|reserved_special_token_213|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128222, + "content": "<|reserved_special_token_214|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128223, + "content": "<|reserved_special_token_215|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128224, + "content": "<|reserved_special_token_216|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128225, + "content": "<|reserved_special_token_217|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128226, + "content": "<|reserved_special_token_218|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128227, + "content": "<|reserved_special_token_219|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128228, + "content": "<|reserved_special_token_220|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128229, + "content": "<|reserved_special_token_221|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128230, + "content": "<|reserved_special_token_222|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128231, + "content": "<|reserved_special_token_223|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128232, + "content": "<|reserved_special_token_224|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128233, + "content": "<|reserved_special_token_225|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128234, + "content": "<|reserved_special_token_226|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128235, + "content": "<|reserved_special_token_227|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128236, + "content": "<|reserved_special_token_228|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128237, + "content": "<|reserved_special_token_229|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128238, + "content": "<|reserved_special_token_230|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128239, + "content": "<|reserved_special_token_231|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128240, + "content": "<|reserved_special_token_232|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128241, + "content": "<|reserved_special_token_233|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128242, + "content": "<|reserved_special_token_234|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128243, + "content": "<|reserved_special_token_235|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128244, + "content": "<|reserved_special_token_236|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128245, + "content": "<|reserved_special_token_237|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128246, + "content": "<|reserved_special_token_238|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128247, + "content": "<|reserved_special_token_239|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128248, + "content": "<|reserved_special_token_240|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128249, + "content": "<|reserved_special_token_241|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128250, + "content": "<|reserved_special_token_242|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128251, + "content": "<|reserved_special_token_243|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128252, + "content": "<|reserved_special_token_244|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128253, + "content": "<|reserved_special_token_245|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128254, + "content": "<|reserved_special_token_246|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 128255, + "content": "<|reserved_special_token_247|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 128000 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": true, + "vocab": { + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + "R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "¡": 94, + "¢": 95, + "£": 96, + "¤": 97, + "¥": 98, + "¦": 99, + "§": 100, + "¨": 101, + "©": 102, + "ª": 103, + "«": 104, + "¬": 105, + "®": 106, + "¯": 107, + "°": 108, + "±": 109, + "²": 110, + "³": 111, + "´": 112, + "µ": 113, + "¶": 114, + "·": 115, + "¸": 116, + "¹": 117, + "º": 118, + "»": 119, + "¼": 120, + "½": 121, + "¾": 122, + "¿": 123, + "À": 124, + "Á": 125, + "Â": 126, + "Ã": 127, + "Ä": 128, + "Å": 129, + "Æ": 130, + "Ç": 131, + "È": 132, + "É": 133, + "Ê": 134, + "Ë": 135, + "Ì": 136, + "Í": 137, + "Î": 138, + "Ï": 139, + "Ð": 140, + "Ñ": 141, + "Ò": 142, + "Ó": 143, + "Ô": 144, + "Õ": 145, + "Ö": 146, + "×": 147, + "Ø": 148, + "Ù": 149, + "Ú": 150, + "Û": 151, + "Ü": 152, + "Ý": 153, + "Þ": 154, + "ß": 155, + "à": 156, + "á": 157, + "â": 158, + "ã": 159, + "ä": 160, + "å": 161, + "æ": 162, + "ç": 163, + "è": 164, + "é": 165, + "ê": 166, + "ë": 167, + "ì": 168, + "í": 169, + "î": 170, + "ï": 171, + "ð": 172, + "ñ": 173, + "ò": 174, + "ó": 175, + "ô": 176, + "õ": 177, + "ö": 178, + "÷": 179, + "ø": 180, + "ù": 181, + "ú": 182, + "û": 183, + "ü": 184, + "ý": 185, + "þ": 186, + "ÿ": 187, + "Ā": 188, + "ā": 189, + "Ă": 190, + "ă": 191, + "Ą": 192, + "ą": 193, + "Ć": 194, + "ć": 195, + "Ĉ": 196, + "ĉ": 197, + "Ċ": 198, + "ċ": 199, + "Č": 200, + "č": 201, + "Ď": 202, + "ď": 203, + "Đ": 204, + "đ": 205, + "Ē": 206, + "ē": 207, + "Ĕ": 208, + "ĕ": 209, + "Ė": 210, + "ė": 211, + "Ę": 212, + "ę": 213, + "Ě": 214, + "ě": 215, + "Ĝ": 216, + "ĝ": 217, + "Ğ": 218, + "ğ": 219, + "Ġ": 220, + "ġ": 221, + "Ģ": 222, + "ģ": 223, + "Ĥ": 224, + "ĥ": 225, + "Ħ": 226, + "ħ": 227, + "Ĩ": 228, + "ĩ": 229, + "Ī": 230, + "ī": 231, + "Ĭ": 232, + "ĭ": 233, + "Į": 234, + "į": 235, + "İ": 236, + "ı": 237, + "IJ": 238, + "ij": 239, + "Ĵ": 240, + "ĵ": 241, + "Ķ": 242, + "ķ": 243, + "ĸ": 244, + "Ĺ": 245, + "ĺ": 246, + "Ļ": 247, + "ļ": 248, + "Ľ": 249, + "ľ": 250, + "Ŀ": 251, + "ŀ": 252, + "Ł": 253, + "ł": 254, + "Ń": 255, + "ĠĠ": 256, + "ĠĠĠĠ": 257, + "in": 258, + "Ġt": 259, + "ĠĠĠĠĠĠĠĠ": 260, + "er": 261, + "ĠĠĠ": 262, + "on": 263, + "Ġa": 264, + "re": 265, + "at": 266, + "st": 267, + "en": 268, + "or": 269, + "Ġth": 270, + "ĊĊ": 271, + "Ġc": 272, + "le": 273, + "Ġs": 274, + "it": 275, + "an": 276, + "ar": 277, + "al": 278, + "Ġthe": 279, + ";Ċ": 280, + "Ġp": 281, + "Ġf": 282, + "ou": 283, + "Ġ=": 284, + "is": 285, + "ĠĠĠĠĠĠĠ": 286, + "ing": 287, + "es": 288, + "Ġw": 289, + "ion": 290, + "ed": 291, + "ic": 292, + "Ġb": 293, + "Ġd": 294, + "et": 295, + "Ġm": 296, + "Ġo": 297, + "ĉĉ": 298, + "ro": 299, + "as": 300, + "el": 301, + "ct": 302, + "nd": 303, + "Ġin": 304, + "Ġh": 305, + "ent": 306, + "id": 307, + "Ġn": 308, + "am": 309, + "ĠĠĠĠĠĠĠĠĠĠĠ": 310, + "Ġto": 311, + "Ġre": 312, + "--": 313, + "Ġ{": 314, + "Ġof": 315, + "om": 316, + ");Ċ": 317, + "im": 318, + "čĊ": 319, + "Ġ(": 320, + "il": 321, + "//": 322, + "Ġand": 323, + "ur": 324, + "se": 325, + "Ġl": 326, + "ex": 327, + "ĠS": 328, + "ad": 329, + "Ġ\"": 330, + "ch": 331, + "ut": 332, + "if": 333, + "**": 334, + "Ġ}": 335, + "em": 336, + "ol": 337, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 338, + "th": 339, + ")Ċ": 340, + "Ġ{Ċ": 341, + "Ġg": 342, + "ig": 343, + "iv": 344, + ",Ċ": 345, + "ce": 346, + "od": 347, + "Ġv": 348, + "ate": 349, + "ĠT": 350, + "ag": 351, + "ay": 352, + "Ġ*": 353, + "ot": 354, + "us": 355, + "ĠC": 356, + "Ġst": 357, + "ĠI": 358, + "un": 359, + "ul": 360, + "ue": 361, + "ĠA": 362, + "ow": 363, + "Ġ'": 364, + "ew": 365, + "Ġ<": 366, + "ation": 367, + "()": 368, + "Ġfor": 369, + "ab": 370, + "ort": 371, + "um": 372, + "ame": 373, + "Ġis": 374, + "pe": 375, + "tr": 376, + "ck": 377, + "âĢ": 378, + "Ġy": 379, + "ist": 380, + "----": 381, + ".ĊĊ": 382, + "he": 383, + "Ġe": 384, + "lo": 385, + "ĠM": 386, + "Ġbe": 387, + "ers": 388, + "Ġon": 389, + "Ġcon": 390, + "ap": 391, + "ub": 392, + "ĠP": 393, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 394, + "ass": 395, + "int": 396, + ">Ċ": 397, + "ly": 398, + "urn": 399, + "Ġ$": 400, + ";ĊĊ": 401, + "av": 402, + "port": 403, + "ir": 404, + "->": 405, + "nt": 406, + "ction": 407, + "end": 408, + "Ġde": 409, + "00": 410, + "ith": 411, + "out": 412, + "turn": 413, + "our": 414, + "ĠĠĠĠĠ": 415, + "lic": 416, + "res": 417, + "pt": 418, + "==": 419, + "Ġthis": 420, + "Ġwh": 421, + "Ġif": 422, + "ĠD": 423, + "ver": 424, + "age": 425, + "ĠB": 426, + "ht": 427, + "ext": 428, + "=\"": 429, + "Ġthat": 430, + "****": 431, + "ĠR": 432, + "Ġit": 433, + "ess": 434, + "ĠF": 435, + "Ġr": 436, + "os": 437, + "and": 438, + "Ġas": 439, + "ect": 440, + "ke": 441, + "rom": 442, + "Ġ//": 443, + "con": 444, + "ĠL": 445, + "(\"": 446, + "qu": 447, + "lass": 448, + "Ġwith": 449, + "iz": 450, + "de": 451, + "ĠN": 452, + "Ġal": 453, + "op": 454, + "up": 455, + "get": 456, + "Ġ}Ċ": 457, + "ile": 458, + "Ġan": 459, + "ata": 460, + "ore": 461, + "ri": 462, + "Ġpro": 463, + ";čĊ": 464, + "ĉĉĉĉ": 465, + "ter": 466, + "ain": 467, + "ĠW": 468, + "ĠE": 469, + "Ġcom": 470, + "Ġreturn": 471, + "art": 472, + "ĠH": 473, + "ack": 474, + "import": 475, + "ublic": 476, + "Ġor": 477, + "est": 478, + "ment": 479, + "ĠG": 480, + "able": 481, + "Ġ-": 482, + "ine": 483, + "ill": 484, + "ind": 485, + "ere": 486, + "::": 487, + "ity": 488, + "Ġ+": 489, + "Ġtr": 490, + "elf": 491, + "ight": 492, + "('": 493, + "orm": 494, + "ult": 495, + "str": 496, + "..": 497, + "\",": 498, + "Ġyou": 499, + "ype": 500, + "pl": 501, + "Ġnew": 502, + "Ġj": 503, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 504, + "Ġfrom": 505, + "Ġex": 506, + "ĠO": 507, + "20": 508, + "ld": 509, + "Ġ[": 510, + "oc": 511, + ":Ċ": 512, + "Ġse": 513, + "Ġle": 514, + "--------": 515, + ".s": 516, + "{Ċ": 517, + "',": 518, + "ant": 519, + "Ġat": 520, + "ase": 521, + ".c": 522, + "Ġch": 523, + "": 524, + "ave": 525, + "ang": 526, + "Ġare": 527, + "Ġint": 528, + "âĢĻ": 529, + "_t": 530, + "ert": 531, + "ial": 532, + "act": 533, + "}Ċ": 534, + "ive": 535, + "ode": 536, + "ost": 537, + "Ġclass": 538, + "Ġnot": 539, + "og": 540, + "ord": 541, + "alue": 542, + "all": 543, + "ff": 544, + "();Ċ": 545, + "ont": 546, + "ime": 547, + "are": 548, + "ĠU": 549, + "Ġpr": 550, + "Ġ:": 551, + "ies": 552, + "ize": 553, + "ure": 554, + "Ġby": 555, + "ire": 556, + "Ġ}ĊĊ": 557, + ".p": 558, + "Ġsh": 559, + "ice": 560, + "ast": 561, + "ption": 562, + "tring": 563, + "ok": 564, + "__": 565, + "cl": 566, + "##": 567, + "Ġhe": 568, + "ard": 569, + ").": 570, + "Ġ@": 571, + "iew": 572, + "ĉĉĉ": 573, + "Ġwas": 574, + "ip": 575, + "this": 576, + "Ġu": 577, + "ĠThe": 578, + "ide": 579, + "ace": 580, + "ib": 581, + "ac": 582, + "rou": 583, + "Ġwe": 584, + "ject": 585, + "Ġpublic": 586, + "ak": 587, + "ve": 588, + "ath": 589, + "oid": 590, + "Ġ=>": 591, + "ust": 592, + "que": 593, + "Ġres": 594, + "))": 595, + "'s": 596, + "Ġk": 597, + "ans": 598, + "yst": 599, + "unction": 600, + "********": 601, + "Ġi": 602, + "Ġus": 603, + "pp": 604, + "10": 605, + "one": 606, + "ail": 607, + "====": 608, + "name": 609, + "Ġstr": 610, + "Ġ/": 611, + "Ġ&": 612, + "ach": 613, + "div": 614, + "ystem": 615, + "ell": 616, + "Ġhave": 617, + "err": 618, + "ould": 619, + "ull": 620, + "pon": 621, + "ĠJ": 622, + "_p": 623, + "Ġ==": 624, + "ign": 625, + "St": 626, + ".Ċ": 627, + "Ġpl": 628, + ");ĊĊ": 629, + "form": 630, + "put": 631, + "ount": 632, + "}ĊĊ": 633, + "dd": 634, + "ite": 635, + "Ġget": 636, + "rr": 637, + "ome": 638, + "ĠâĢ": 639, + "aram": 640, + "cc": 641, + "Ġ*/": 642, + "ER": 643, + "In": 644, + "les": 645, + "_s": 646, + "ong": 647, + "ie": 648, + "Ġcan": 649, + "ĠV": 650, + "erv": 651, + "pr": 652, + "Ġun": 653, + "row": 654, + "ber": 655, + "Ġdo": 656, + "ll": 657, + "Ġel": 658, + "Ġself": 659, + "ated": 660, + "ary": 661, + "Ġ.": 662, + "']": 663, + "ud": 664, + "Ġen": 665, + "ĠTh": 666, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 667, + "te": 668, + "_c": 669, + "uct": 670, + "Ġab": 671, + "ork": 672, + ".get": 673, + "Ġ#": 674, + "aw": 675, + "ress": 676, + "ob": 677, + "Name": 678, + "201": 679, + "app": 680, + "['": 681, + "Ġall": 682, + "ory": 683, + "ition": 684, + "ance": 685, + "ear": 686, + "Ġcont": 687, + "vent": 688, + "ia": 689, + "Ġwill": 690, + "IN": 691, + "ĠĠĠĠĠĠĠĠĠ": 692, + "return": 693, + "Ġ": 694, + "data": 695, + ")ĊĊ": 696, + "Re": 697, + "ple": 698, + "ild": 699, + "ther": 700, + "Ġyour": 701, + "\"Ċ": 702, + "($": 703, + "Ġout": 704, + "),": 705, + "Ġhas": 706, + "String": 707, + "so": 708, + "Ġup": 709, + "ax": 710, + "Ġdef": 711, + "Ġbo": 712, + "ge": 713, + "alse": 714, + "ON": 715, + "per": 716, + "12": 717, + "ich": 718, + "Ġbut": 719, + "ĠĊ": 720, + "Ġ_": 721, + "_m": 722, + "add": 723, + "quest": 724, + "odel": 725, + "self": 726, + "ery": 727, + "ft": 728, + "ens": 729, + "////": 730, + "ake": 731, + ".C": 732, + "Ġgo": 733, + "Ġfunction": 734, + "ĠK": 735, + "ivate": 736, + "Ġim": 737, + "Ġconst": 738, + ".t": 739, + "Ġ*/Ċ": 740, + ");čĊ": 741, + "Ġvoid": 742, + "Ġset": 743, + "ĠSystem": 744, + "cri": 745, + "()Ċ": 746, + "li": 747, + "ĉif": 748, + ".m": 749, + "ally": 750, + "set": 751, + "ep": 752, + "âĢĻs": 753, + "bo": 754, + "def": 755, + "',Ċ": 756, + "Ġme": 757, + "Ġ!": 758, + "atch": 759, + "\">": 760, + "\",Ċ": 761, + "ec": 762, + "ĠIn": 763, + "ph": 764, + "Ġ|": 765, + "_f": 766, + "Ġvar": 767, + "ence": 768, + "Id": 769, + "ree": 770, + "ink": 771, + "lect": 772, + "ug": 773, + "eth": 774, + "Ġelse": 775, + "----------------": 776, + "19": 777, + "cont": 778, + "Ġso": 779, + "atic": 780, + "Ġlo": 781, + "pro": 782, + "ton": 783, + "ss": 784, + "own": 785, + "abel": 786, + "oint": 787, + "ous": 788, + "eld": 789, + "ST": 790, + "The": 791, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 792, + "RE": 793, + "\":": 794, + "olor": 795, + "tp": 796, + "eg": 797, + "key": 798, + "ude": 799, + "ĠSt": 800, + "ound": 801, + "Ġar": 802, + "\");Ċ": 803, + "ener": 804, + "ser": 805, + "11": 806, + "bject": 807, + "essage": 808, + "fer": 809, + "Ġmore": 810, + "ations": 811, + "ents": 812, + "Ġhis": 813, + "Ġthey": 814, + ".S": 815, + "ĠY": 816, + "use": 817, + "ne": 818, + "ish": 819, + "old": 820, + "_d": 821, + "io": 822, + "ield": 823, + "Ġper": 824, + "Cont": 825, + "ings": 826, + "####": 827, + "Ġdata": 828, + "Ġsa": 829, + "ef": 830, + "fo": 831, + "Ġone": 832, + "eng": 833, + "Ġdis": 834, + "AT": 835, + "Ġname": 836, + "Ġtrue": 837, + "val": 838, + "led": 839, + ".f": 840, + "Ġne": 841, + "Ġend": 842, + "32": 843, + ".T": 844, + "16": 845, + "cre": 846, + "ark": 847, + "log": 848, + "Ex": 849, + "error": 850, + "_id": 851, + "urre": 852, + "ange": 853, + "Ġnull": 854, + "rray": 855, + "Ġmy": 856, + "pan": 857, + "ict": 858, + "ator": 859, + "View": 860, + "List": 861, + "ĉreturn": 862, + "âĢĿ": 863, + "Ġpre": 864, + "Ġx": 865, + "clude": 866, + "arg": 867, + "15": 868, + "ov": 869, + ".h": 870, + "Ġ>": 871, + "Ġtheir": 872, + "')": 873, + "irst": 874, + "ick": 875, + "gh": 876, + "LE": 877, + "OR": 878, + "Ġprivate": 879, + "tem": 880, + "čĊčĊ": 881, + "user": 882, + "Ġ)": 883, + "com": 884, + ".A": 885, + "\";Ċ": 886, + "Ġid": 887, + "read": 888, + "Ġwho": 889, + "_b": 890, + "\">Ċ": 891, + "Ġtime": 892, + "Ġman": 893, + "ry": 894, + "========": 895, + "roup": 896, + "rop": 897, + "public": 898, + "vel": 899, + "umber": 900, + "ble": 901, + "Ġwhich": 902, + "****************": 903, + "Ġany": 904, + "Ġfalse": 905, + "we": 906, + "Ġvalue": 907, + "Ġli": 908, + "\")": 909, + "nder": 910, + "gr": 911, + "Ġno": 912, + "param": 913, + "25": 914, + "fig": 915, + ".com": 916, + "Ġapp": 917, + "_l": 918, + "ions": 919, + ".D": 920, + "ĠCh": 921, + "Ġabout": 922, + "Ġadd": 923, + "Ġsu": 924, + "Ġstring": 925, + "ID": 926, + "Ġover": 927, + "string": 928, + ".l": 929, + "ource": 930, + "000": 931, + "_C": 932, + "]Ċ": 933, + "Ġqu": 934, + "ĠString": 935, + "ca": 936, + "SE": 937, + "Ġro": 938, + "sh": 939, + "ual": 940, + "Type": 941, + "son": 942, + "new": 943, + "ern": 944, + "Ġag": 945, + "AR": 946, + "];Ċ": 947, + "].": 948, + "Ġ?": 949, + "ical": 950, + "Ġdes": 951, + "uth": 952, + "ix": 953, + "ays": 954, + "Ġtype": 955, + "'t": 956, + "ault": 957, + "Ġinter": 958, + "var": 959, + ".b": 960, + "Ġpart": 961, + ".d": 962, + "urrent": 963, + "IT": 964, + "EN": 965, + "30": 966, + "enc": 967, + "(f": 968, + "ra": 969, + "value": 970, + "cho": 971, + "18": 972, + "utton": 973, + "ose": 974, + "14": 975, + "Ġ!=": 976, + "ater": 977, + "é": 978, + "reate": 979, + "oll": 980, + "pos": 981, + "yle": 982, + "ng": 983, + "AL": 984, + "using": 985, + "ames": 986, + "Ġ{čĊ": 987, + "ates": 988, + "ely": 989, + "Ġwork": 990, + "Ġem": 991, + "inal": 992, + "Ġsp": 993, + "Ġwhen": 994, + ".set": 995, + "ĠĠĠĠĠĠ": 996, + "):Ċ": 997, + "to": 998, + "quire": 999, + "indow": 1000, + "lement": 1001, + "pect": 1002, + "ash": 1003, + "[i": 1004, + "Ġuse": 1005, + ".F": 1006, + "pec": 1007, + "Ġad": 1008, + "ove": 1009, + "ception": 1010, + "ength": 1011, + "include": 1012, + "ader": 1013, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1014, + "atus": 1015, + "Th": 1016, + "itle": 1017, + "rit": 1018, + "void": 1019, + "().": 1020, + "(Ċ": 1021, + "Ġoff": 1022, + "Ġother": 1023, + "Ġ&&": 1024, + "';Ċ": 1025, + "ms": 1026, + "Ġbeen": 1027, + "Ġte": 1028, + "ml": 1029, + "co": 1030, + "nc": 1031, + "13": 1032, + "ervice": 1033, + "Ġ%": 1034, + "**Ċ": 1035, + "ann": 1036, + "ade": 1037, + "ĊĊĊĊ": 1038, + "lock": 1039, + "const": 1040, + "100": 1041, + "ponse": 1042, + "Ġsup": 1043, + "++": 1044, + "date": 1045, + "Ġacc": 1046, + "Ġhad": 1047, + "Ġbu": 1048, + "200": 1049, + "ĠRe": 1050, + "Ġwere": 1051, + "Ġfile": 1052, + "Ġwould": 1053, + "ĠâĢľ": 1054, + "ven": 1055, + "iss": 1056, + "Ġour": 1057, + "class": 1058, + "raw": 1059, + "Ġyear": 1060, + "Data": 1061, + "Ġval": 1062, + "Ġsome": 1063, + "fter": 1064, + "ys": 1065, + "Ġ///": 1066, + "round": 1067, + "view": 1068, + "Ġpe": 1069, + "Ġthere": 1070, + "Ġsaid": 1071, + "du": 1072, + "of": 1073, + "line": 1074, + "/*": 1075, + "duct": 1076, + "Ġher": 1077, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1078, + "Res": 1079, + "Ġco": 1080, + "Ġcomm": 1081, + "ise": 1082, + "min": 1083, + "ĠĠĠĠĊ": 1084, + "#include": 1085, + "ethod": 1086, + ".P": 1087, + "ute": 1088, + "Ġass": 1089, + "Int": 1090, + "ask": 1091, + "loc": 1092, + "Ġlike": 1093, + "ody": 1094, + "Ġlet": 1095, + "load": 1096, + "Ġam": 1097, + "rol": 1098, + "Ġgr": 1099, + "yp": 1100, + "Ġalso": 1101, + "ĠIt": 1102, + "url": 1103, + "ific": 1104, + "ors": 1105, + "_P": 1106, + "_n": 1107, + "igh": 1108, + "Ġthan": 1109, + "Com": 1110, + "AN": 1111, + "UL": 1112, + "ating": 1113, + "17": 1114, + "ĠThis": 1115, + "ref": 1116, + "_S": 1117, + "Ġstatic": 1118, + "roll": 1119, + "Ġjust": 1120, + "Ġresult": 1121, + "ian": 1122, + "idth": 1123, + "Ġthem": 1124, + "));Ċ": 1125, + "der": 1126, + "reak": 1127, + "Con": 1128, + "://": 1129, + "ule": 1130, + "...": 1131, + "arch": 1132, + "ement": 1133, + "Ġ<<": 1134, + "50": 1135, + "ush": 1136, + "ense": 1137, + "arr": 1138, + "Ġinto": 1139, + "cess": 1140, + "amp": 1141, + "ied": 1142, + "ument": 1143, + "Ġ\\": 1144, + "],": 1145, + "wo": 1146, + "als": 1147, + "Ġwhat": 1148, + "anc": 1149, + "Value": 1150, + "='": 1151, + "olum": 1152, + "Ġpos": 1153, + "ages": 1154, + "ayer": 1155, + "Ġsc": 1156, + "ues": 1157, + "\")Ċ": 1158, + "_T": 1159, + "Ġlist": 1160, + "(s": 1161, + "Ġcase": 1162, + "Ch": 1163, + "ĉĉĉĉĉ": 1164, + "////////": 1165, + "ponent": 1166, + "Ġz": 1167, + "Ġkn": 1168, + "let": 1169, + "DE": 1170, + "red": 1171, + "Ġfe": 1172, + "Ġ},Ċ": 1173, + "Ġ,": 1174, + "(t": 1175, + "Ġfirst": 1176, + "');Ċ": 1177, + "word": 1178, + "Ġimport": 1179, + "Ġact": 1180, + "Ġchar": 1181, + "CT": 1182, + "ĠTr": 1183, + "ople": 1184, + "={": 1185, + "ĉf": 1186, + "24": 1187, + "ient": 1188, + "cent": 1189, + ".j": 1190, + "lection": 1191, + "))Ċ": 1192, + "Ġonly": 1193, + "Ġprint": 1194, + "mer": 1195, + ".W": 1196, + "ock": 1197, + "Ġ--": 1198, + "Text": 1199, + "Ġop": 1200, + "ank": 1201, + "Ġits": 1202, + "Ġback": 1203, + "[\"": 1204, + "Ġneed": 1205, + "Ġcl": 1206, + "Ġsub": 1207, + "Ġla": 1208, + "((": 1209, + ".\"": 1210, + "Object": 1211, + "Ġstart": 1212, + "file": 1213, + "(self": 1214, + "ner": 1215, + "ey": 1216, + "Ġuser": 1217, + "Ġent": 1218, + "ĠCom": 1219, + "its": 1220, + "ĠCon": 1221, + "ouble": 1222, + "ower": 1223, + "item": 1224, + "very": 1225, + "ĠWe": 1226, + "64": 1227, + "lick": 1228, + "ĠQ": 1229, + "php": 1230, + "ttp": 1231, + "':": 1232, + "ics": 1233, + "Ġunder": 1234, + "Ġ*Ċ": 1235, + ".L": 1236, + ");": 1237, + "ices": 1238, + "Ġreg": 1239, + ")čĊ": 1240, + "ĉpublic": 1241, + "SS": 1242, + "Ġthen": 1243, + "reat": 1244, + "ious": 1245, + ".G": 1246, + "ek": 1247, + "irect": 1248, + "heck": 1249, + "cript": 1250, + "ning": 1251, + "ĠUn": 1252, + "Ġmay": 1253, + "ĠWh": 1254, + "Bo": 1255, + "Item": 1256, + "struct": 1257, + ".st": 1258, + "ream": 1259, + "ible": 1260, + "loat": 1261, + "Ġorg": 1262, + "und": 1263, + "sum": 1264, + "_in": 1265, + "../": 1266, + "_M": 1267, + "Ġhow": 1268, + "rite": 1269, + "'Ċ": 1270, + "To": 1271, + "40": 1272, + "ww": 1273, + "Ġpeople": 1274, + "index": 1275, + ".n": 1276, + "http": 1277, + "(m": 1278, + "ector": 1279, + "Ġind": 1280, + "Ġjav": 1281, + "],Ċ": 1282, + "ĠHe": 1283, + "_st": 1284, + "ful": 1285, + "ole": 1286, + "){Ċ": 1287, + "Ġshould": 1288, + "opy": 1289, + "elp": 1290, + "ier": 1291, + "_name": 1292, + "erson": 1293, + "ION": 1294, + "ote": 1295, + "Ġtest": 1296, + "Ġbet": 1297, + "rror": 1298, + "ular": 1299, + "ãĢ": 1300, + "ĠÐ": 1301, + "bs": 1302, + "ting": 1303, + "Ġmake": 1304, + "Tr": 1305, + "Ġafter": 1306, + "arget": 1307, + "RO": 1308, + "olumn": 1309, + "rc": 1310, + "_re": 1311, + "define": 1312, + "22": 1313, + "Ġright": 1314, + "right": 1315, + "day": 1316, + "Ġlong": 1317, + "[]": 1318, + "(p": 1319, + "td": 1320, + "cond": 1321, + "ĠPro": 1322, + "Ġrem": 1323, + "ptions": 1324, + "vid": 1325, + ".g": 1326, + "Ġext": 1327, + "Ġ__": 1328, + "')Ċ": 1329, + "pace": 1330, + "mp": 1331, + "Ġmin": 1332, + "stance": 1333, + "air": 1334, + "action": 1335, + "wh": 1336, + "type": 1337, + "util": 1338, + "ait": 1339, + "": 1340, + "IC": 1341, + "text": 1342, + "Ġph": 1343, + "Ġfl": 1344, + ".M": 1345, + "ccess": 1346, + "br": 1347, + "fore": 1348, + "ersion": 1349, + "),Ċ": 1350, + ".re": 1351, + "ateg": 1352, + "Ġloc": 1353, + "ins": 1354, + "-s": 1355, + "trib": 1356, + "ĠInt": 1357, + "Ġarray": 1358, + ",\"": 1359, + "Pro": 1360, + "(c": 1361, + "ession": 1362, + ">ĊĊ": 1363, + "Ġshe": 1364, + "\"]": 1365, + "aph": 1366, + "Ġexp": 1367, + "erty": 1368, + "ĠSe": 1369, + "Ġpar": 1370, + "unc": 1371, + "ET": 1372, + "Ġread": 1373, + "print": 1374, + "Ġrel": 1375, + "Ġform": 1376, + "Ġdr": 1377, + "Exception": 1378, + "input": 1379, + "Ġtrans": 1380, + "########": 1381, + "order": 1382, + "By": 1383, + "Ġaw": 1384, + "ities": 1385, + "uff": 1386, + "play": 1387, + ".add": 1388, + "ĠâĢĵ": 1389, + "Ġwant": 1390, + "Ġcomp": 1391, + "ments": 1392, + "Ġ||": 1393, + "az": 1394, + "be": 1395, + "Ġnumber": 1396, + "Ġrequire": 1397, + "ĠEx": 1398, + "60": 1399, + "Ġcol": 1400, + "Ġkey": 1401, + "ember": 1402, + "Ġtwo": 1403, + "Ġsize": 1404, + "Ġwhere": 1405, + "UT": 1406, + "result": 1407, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1408, + "ough": 1409, + "orld": 1410, + "ood": 1411, + "uch": 1412, + "ative": 1413, + "ger": 1414, + "arent": 1415, + "Ġ/*": 1416, + "Ġarg": 1417, + "Ġwhile": 1418, + "23": 1419, + "(this": 1420, + "Ġrec": 1421, + "Ġdif": 1422, + "State": 1423, + "Ġspec": 1424, + "ride": 1425, + "_F": 1426, + "Ġlook": 1427, + "AM": 1428, + "ility": 1429, + "eter": 1430, + "âĢĻt": 1431, + "ĊĊĊ": 1432, + "ayout": 1433, + "--------------------------------": 1434, + "ager": 1435, + "Ġcould": 1436, + "Ġbr": 1437, + "ends": 1438, + "ures": 1439, + "Ġknow": 1440, + "ets": 1441, + "ĠIf": 1442, + "ĠSh": 1443, + ".w": 1444, + "back": 1445, + "Ġser": 1446, + "Ġ+=": 1447, + "Ġfr": 1448, + "());Ċ": 1449, + "Ġhand": 1450, + "Ind": 1451, + "ULL": 1452, + "Im": 1453, + "();ĊĊ": 1454, + "Ġmost": 1455, + "Ġtry": 1456, + "Ġnow": 1457, + "rough": 1458, + ">čĊ": 1459, + "ackage": 1460, + "Ġhim": 1461, + "._": 1462, + "ify": 1463, + "Ġbreak": 1464, + "Ġ);Ċ": 1465, + "ren": 1466, + "#define": 1467, + "itt": 1468, + "Ġap": 1469, + "ĉc": 1470, + "(n": 1471, + "ĠYou": 1472, + ":ĊĊ": 1473, + "-m": 1474, + "Ġevery": 1475, + "ustom": 1476, + "lient": 1477, + "ocument": 1478, + "cription": 1479, + "Error": 1480, + "-b": 1481, + "о": 1482, + "][": 1483, + "99": 1484, + "trans": 1485, + "Ġpoint": 1486, + "Ġstd": 1487, + "Ġfil": 1488, + "Time": 1489, + "80": 1490, + "Ġmod": 1491, + "Ġ->": 1492, + "Ġerror": 1493, + "ah": 1494, + "Ġtext": 1495, + "roller": 1496, + "lose": 1497, + "ql": 1498, + "Ġpol": 1499, + ">": 1500, + "Ġshow": 1501, + "User": 1502, + "ased": 1503, + "Ġ{ĊĊ": 1504, + "Ġfind": 1505, + "а": 1506, + "ED": 1507, + "span": 1508, + "enu": 1509, + "Ġcurrent": 1510, + "Ġused": 1511, + "cept": 1512, + "clud": 1513, + "Ġplay": 1514, + "Ġlog": 1515, + "ution": 1516, + "fl": 1517, + "Ġsee": 1518, + "indows": 1519, + "Ġhelp": 1520, + "Ġthese": 1521, + "Ġpass": 1522, + "Ġdown": 1523, + "Ġeven": 1524, + "ason": 1525, + "uild": 1526, + "from": 1527, + "(d": 1528, + "Ġbl": 1529, + "label": 1530, + "else": 1531, + "е": 1532, + "Ġ(!": 1533, + "ized": 1534, + "(),": 1535, + "Ġob": 1536, + "Ġitem": 1537, + "ump": 1538, + "UR": 1539, + "orn": 1540, + "Ġdon": 1541, + "Se": 1542, + "man": 1543, + "27": 1544, + "ample": 1545, + "tn": 1546, + "================": 1547, + "He": 1548, + "gram": 1549, + "Ġdid": 1550, + "wn": 1551, + "_h": 1552, + "iver": 1553, + "Ġsm": 1554, + "Ġthrough": 1555, + "ĠAn": 1556, + "che": 1557, + "Ġinv": 1558, + "ouse": 1559, + "Ġes": 1560, + "ĠNew": 1561, + "export": 1562, + "mary": 1563, + "uto": 1564, + "ler": 1565, + "Ġlast": 1566, + "Ġevent": 1567, + "try": 1568, + "ï¼": 1569, + "ily": 1570, + "igned": 1571, + "ines": 1572, + "ollow": 1573, + "icense": 1574, + "sole": 1575, + "lear": 1576, + "(int": 1577, + "Ġagain": 1578, + "Ġhigh": 1579, + "html": 1580, + "Index": 1581, + "uthor": 1582, + "Ġ/**Ċ": 1583, + "Ġline": 1584, + "Event": 1585, + "_D": 1586, + "Ġdoes": 1587, + "itial": 1588, + "Ġcr": 1589, + "ars": 1590, + "28": 1591, + "Ġtem": 1592, + "cause": 1593, + "face": 1594, + "Ġ`": 1595, + "_A": 1596, + "Button": 1597, + "ature": 1598, + "ected": 1599, + "ES": 1600, + "ister": 1601, + "ĉĊ": 1602, + "Ġbefore": 1603, + "ale": 1604, + "other": 1605, + "Ġbecause": 1606, + "roid": 1607, + "Ġed": 1608, + "ik": 1609, + "reg": 1610, + "ĠDe": 1611, + "Ġdist": 1612, + "},Ċ": 1613, + "Ġstate": 1614, + "Ġcons": 1615, + "rint": 1616, + "att": 1617, + "Ġhere": 1618, + "ined": 1619, + "Ġfinal": 1620, + "Ġ\"\"": 1621, + "Key": 1622, + "LO": 1623, + "Ġdel": 1624, + "pty": 1625, + "thing": 1626, + "26": 1627, + "ĠAnd": 1628, + "Ġrun": 1629, + "ĠX": 1630, + "ym": 1631, + ".app": 1632, + "Ġvery": 1633, + "ces": 1634, + "_N": 1635, + "ared": 1636, + "ward": 1637, + "list": 1638, + "ited": 1639, + "olog": 1640, + "itch": 1641, + "Box": 1642, + "ife": 1643, + "33": 1644, + "Ġac": 1645, + "Ġmodel": 1646, + "Ġmon": 1647, + "Ġway": 1648, + "lete": 1649, + "Ġcall": 1650, + "Ġatt": 1651, + "Ġcal": 1652, + "vert": 1653, + "Ġdec": 1654, + "lease": 1655, + "oun": 1656, + "Ġ});Ċ": 1657, + "fr": 1658, + "formation": 1659, + "etail": 1660, + "Ġnum": 1661, + "aj": 1662, + "query": 1663, + "Ġwell": 1664, + "Ġobject": 1665, + "ĠAs": 1666, + "Ġyears": 1667, + "Color": 1668, + "IS": 1669, + "Ġdefault": 1670, + "Wh": 1671, + "Ġins": 1672, + "aint": 1673, + "Ġjava": 1674, + "Ġsim": 1675, + "ĠAr": 1676, + "mon": 1677, + "til": 1678, + "();čĊ": 1679, + "):": 1680, + "Set": 1681, + "29": 1682, + "atter": 1683, + "Ġview": 1684, + "Ġpres": 1685, + "array": 1686, + "We": 1687, + "At": 1688, + "Ġbel": 1689, + "Ġmany": 1690, + "21": 1691, + "Man": 1692, + "ender": 1693, + "Ġbeing": 1694, + "Ġgood": 1695, + "ĉĉĉĉĉĉ": 1696, + "ational": 1697, + "ware": 1698, + ".log": 1699, + "{čĊ": 1700, + "Ġusing": 1701, + "_B": 1702, + "Ġ:=": 1703, + "_w": 1704, + "ists": 1705, + "lish": 1706, + "Ġstud": 1707, + "ĠAl": 1708, + "Ġgu": 1709, + "config": 1710, + "uring": 1711, + "time": 1712, + "oken": 1713, + "amespace": 1714, + "Ġrequest": 1715, + "Ġchild": 1716, + "ĠÃ": 1717, + "lob": 1718, + "Ġparam": 1719, + "Ġ}čĊ": 1720, + "01": 1721, + "Ġecho": 1722, + "function": 1723, + "********************************": 1724, + "ps": 1725, + "Element": 1726, + "alk": 1727, + "lication": 1728, + "by": 1729, + "Size": 1730, + "rawing": 1731, + "Ġperson": 1732, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1733, + "\\n": 1734, + "object": 1735, + "ince": 1736, + "En": 1737, + "File": 1738, + "uf": 1739, + "ffect": 1740, + "AC": 1741, + "Ġstyle": 1742, + "summary": 1743, + "Ġque": 1744, + "_r": 1745, + "Ġ($": 1746, + "Model": 1747, + "ident": 1748, + "Ġmethod": 1749, + "IL": 1750, + "ott": 1751, + "less": 1752, + "ING": 1753, + "Ġ()": 1754, + "Ġexpect": 1755, + "ync": 1756, + "package": 1757, + "35": 1758, + "urs": 1759, + "Ġprot": 1760, + "./": 1761, + "pre": 1762, + "Ġ)Ċ": 1763, + "ma": 1764, + "Ġsur": 1765, + "Ġfound": 1766, + "Info": 1767, + "par": 1768, + "imes": 1769, + ".e": 1770, + "ains": 1771, + "Ġpost": 1772, + "-d": 1773, + "45": 1774, + "olean": 1775, + "Ġsl": 1776, + "PE": 1777, + "Ġsuch": 1778, + "select": 1779, + "ainer": 1780, + "Ġthink": 1781, + "Ġdiffer": 1782, + ".r": 1783, + "/**Ċ": 1784, + "FF": 1785, + "ool": 1786, + "plate": 1787, + "qual": 1788, + "ĠFor": 1789, + "Ġmuch": 1790, + "uc": 1791, + "(new": 1792, + "odule": 1793, + "Ġsom": 1794, + "Ġhttp": 1795, + "ĠList": 1796, + "Ġcount": 1797, + "Ġinst": 1798, + "char": 1799, + "mit": 1800, + ".id": 1801, + "aking": 1802, + "Ġgener": 1803, + "px": 1804, + "vice": 1805, + "37": 1806, + "_data": 1807, + "ĠNULL": 1808, + "}čĊ": 1809, + "idd": 1810, + "ãĢĤ": 1811, + "Ġmed": 1812, + "org": 1813, + "ider": 1814, + "ache": 1815, + "work": 1816, + "Ġcheck": 1817, + "ween": 1818, + "Ġ((": 1819, + "the": 1820, + "ants": 1821, + "><": 1822, + ".B": 1823, + "-c": 1824, + "Ġopen": 1825, + "Ġest": 1826, + "ĠĠĠĠĠĠĠĠĊ": 1827, + "Ġnext": 1828, + "IM": 1829, + "ÑĤ": 1830, + "OT": 1831, + "ó": 1832, + "Ġfollow": 1833, + "content": 1834, + "ĠĠĠĠĠĠĠĠĠĠĠĠ": 1835, + "Ġinclud": 1836, + "HE": 1837, + "ĠRes": 1838, + "Ġhref": 1839, + "и": 1840, + "Ġcar": 1841, + "ypes": 1842, + "image": 1843, + "Un": 1844, + "Ġbool": 1845, + "AD": 1846, + "Ġgame": 1847, + ".Form": 1848, + "rows": 1849, + "*/": 1850, + "velop": 1851, + ".Drawing": 1852, + "Ġpath": 1853, + "ision": 1854, + "Ġeach": 1855, + "ĠPl": 1856, + "_type": 1857, + "Path": 1858, + "nection": 1859, + "Ġav": 1860, + "').": 1861, + "Ġsupport": 1862, + "ENT": 1863, + "rem": 1864, + "\").": 1865, + "Ġown": 1866, + "Ġcor": 1867, + "count": 1868, + "miss": 1869, + "ually": 1870, + "Ġmem": 1871, + "std": 1872, + "ience": 1873, + "search": 1874, + "\"ĊĊ": 1875, + "Form": 1876, + "Ġsex": 1877, + "ename": 1878, + "Ġsign": 1879, + "Ġet": 1880, + "ĠĠĠĠĠĠĠĠĠĠ": 1881, + "','": 1882, + "ĠApp": 1883, + "Ġthose": 1884, + "off": 1885, + "Ġerr": 1886, + "Ġsystem": 1887, + "Ġbest": 1888, + "code": 1889, + "Ġsame": 1890, + "Ġdi": 1891, + "uss": 1892, + "Ġcreate": 1893, + "ather": 1894, + "Array": 1895, + ".in": 1896, + "fe": 1897, + "Service": 1898, + "UN": 1899, + "ats": 1900, + "ĠZ": 1901, + "alth": 1902, + "Ġmade": 1903, + "true": 1904, + "AB": 1905, + "Ġmark": 1906, + "rid": 1907, + "ified": 1908, + ",čĊ": 1909, + "yn": 1910, + "press": 1911, + "Ġgroup": 1912, + "Ġfin": 1913, + "ĠLicense": 1914, + "Field": 1915, + "eger": 1916, + "Ġworld": 1917, + "iness": 1918, + "ty": 1919, + "Ġprocess": 1920, + "(b": 1921, + "Ġcre": 1922, + "arn": 1923, + "ives": 1924, + "Ġmain": 1925, + "ideo": 1926, + "36": 1927, + "_g": 1928, + "AG": 1929, + "valid": 1930, + "img": 1931, + "PI": 1932, + "Ġcolor": 1933, + "Ġreport": 1934, + "Ġtake": 1935, + "rib": 1936, + "OM": 1937, + "Ġday": 1938, + "Request": 1939, + "Ġsk": 1940, + "bers": 1941, + "ĉs": 1942, + ".Add": 1943, + "oot": 1944, + "Image": 1945, + "Ġcomple": 1946, + "ollection": 1947, + "Ġtop": 1948, + "Ġfree": 1949, + "AS": 1950, + "De": 1951, + "ĠOn": 1952, + "IG": 1953, + "90": 1954, + "eta": 1955, + "Date": 1956, + "Ġaction": 1957, + "34": 1958, + "Over": 1959, + "itor": 1960, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 1961, + "not": 1962, + "Ġindex": 1963, + "her": 1964, + "icon": 1965, + "On": 1966, + ";čĊčĊ": 1967, + "ivity": 1968, + "mand": 1969, + ".Windows": 1970, + "OL": 1971, + "Ġreal": 1972, + "Ġmax": 1973, + "land": 1974, + "....": 1975, + "raph": 1976, + "Ġbuild": 1977, + "leg": 1978, + "assword": 1979, + "?ĊĊ": 1980, + "â̦": 1981, + "ook": 1982, + "uck": 1983, + "Ġmessage": 1984, + "test": 1985, + "ivers": 1986, + "38": 1987, + "Ġinput": 1988, + "Ġart": 1989, + "Ġbetween": 1990, + "Get": 1991, + "enter": 1992, + "ground": 1993, + "ene": 1994, + "á": 1995, + ".length": 1996, + "Node": 1997, + "(i": 1998, + "Class": 1999, + "for": 2000, + "ĠâĢĶ": 2001, + "ten": 2002, + "oin": 2003, + "Ġke": 2004, + "ui": 2005, + "ĠIN": 2006, + "Ġtable": 2007, + "sub": 2008, + "ĠLe": 2009, + "Ġhead": 2010, + "Ġmust": 2011, + "////////////////": 2012, + ".util": 2013, + "Context": 2014, + "Ġorder": 2015, + "Ġmov": 2016, + "over": 2017, + "Ġcontin": 2018, + "Ġsay": 2019, + "static": 2020, + ".Text": 2021, + "ĠclassName": 2022, + "pany": 2023, + "Ġter": 2024, + "head": 2025, + "rg": 2026, + "Ġproduct": 2027, + "This": 2028, + ".âĢĿ": 2029, + "ĠBut": 2030, + "70": 2031, + "loy": 2032, + "Ġdouble": 2033, + "sg": 2034, + "Ġplace": 2035, + ".x": 2036, + "message": 2037, + "Ġinformation": 2038, + "private": 2039, + "Ġoper": 2040, + "ced": 2041, + "db": 2042, + "\">": 2043, + "Param": 2044, + "icle": 2045, + "Ġweek": 2046, + "Ġprop": 2047, + "table": 2048, + "idget": 2049, + "place": 2050, + "Prop": 2051, + "ĠAll": 2052, + "els": 2053, + "box": 2054, + ".ĊĊĊĊ": 2055, + ".R": 2056, + "ĠTo": 2057, + "iter": 2058, + "Sh": 2059, + "uration": 2060, + "older": 2061, + "_list": 2062, + "come": 2063, + "Ġsw": 2064, + "ization": 2065, + "ĉfor": 2066, + "bl": 2067, + "Ġprogram": 2068, + "(e": 2069, + "ape": 2070, + "check": 2071, + ".Forms": 2072, + "Ġund": 2073, + "ategory": 2074, + "75": 2075, + "ags": 2076, + "Ġresponse": 2077, + "US": 2078, + "request": 2079, + "Ġstruct": 2080, + "escription": 2081, + "Ġcode": 2082, + "_H": 2083, + "uffer": 2084, + "Ġwithout": 2085, + "lobal": 2086, + "Manager": 2087, + "ilter": 2088, + "PO": 2089, + "ĉthis": 2090, + "option": 2091, + "Ġsol": 2092, + "Ġ===": 2093, + "akes": 2094, + "Controller": 2095, + "44": 2096, + "Message": 2097, + "Ġref": 2098, + "ever": 2099, + "ĠSo": 2100, + "aining": 2101, + ".append": 2102, + "Ġstill": 2103, + "Ġprovid": 2104, + "Ġassert": 2105, + "med": 2106, + "Ġcap": 2107, + "usiness": 2108, + "Ġrep": 2109, + "tings": 2110, + "ved": 2111, + ".N": 2112, + "api": 2113, + "OD": 2114, + "Ġfield": 2115, + "iven": 2116, + "oto": 2117, + "âĢľ": 2118, + "col": 2119, + "(x": 2120, + "ght": 2121, + "Result": 2122, + "Code": 2123, + ".is": 2124, + "link": 2125, + "Ġcour": 2126, + "An": 2127, + "Ġteam": 2128, + "ĉint": 2129, + "ift": 2130, + "55": 2131, + "Ġsecond": 2132, + "Ġgoing": 2133, + "Ġrange": 2134, + "_E": 2135, + "ness": 2136, + "39": 2137, + "Ġfam": 2138, + "Ġnil": 2139, + "ĠCont": 2140, + "ailable": 2141, + "utes": 2142, + "atab": 2143, + "Ġfact": 2144, + "Ġvis": 2145, + "(&": 2146, + "ĠAN": 2147, + "31": 2148, + "Al": 2149, + "title": 2150, + "Ġandroid": 2151, + "CE": 2152, + "\\\"": 2153, + "irt": 2154, + "Ġwrit": 2155, + "н": 2156, + "ĉm": 2157, + "ftware": 2158, + "ond": 2159, + "Ġret": 2160, + "osition": 2161, + "Ġhome": 2162, + "Ġleft": 2163, + "args": 2164, + "meric": 2165, + "48": 2166, + "Ġdirect": 2167, + "oci": 2168, + "Pl": 2169, + "As": 2170, + "ret": 2171, + "ado": 2172, + "Of": 2173, + "chn": 2174, + "ĠGet": 2175, + "ee": 2176, + "ross": 2177, + "();": 2178, + "____": 2179, + ".ph": 2180, + "It": 2181, + "oute": 2182, + "Ġexper": 2183, + "chool": 2184, + "www": 2185, + "},": 2186, + "Ġallow": 2187, + "ĠÂ": 2188, + "())": 2189, + "size": 2190, + "ism": 2191, + "ai": 2192, + "tract": 2193, + "ane": 2194, + "...ĊĊ": 2195, + "context": 2196, + "Ġbeg": 2197, + "CH": 2198, + "Ġpage": 2199, + "hip": 2200, + "no": 2201, + "core": 2202, + "sp": 2203, + "Ġdifferent": 2204, + "iable": 2205, + "ĠMe": 2206, + "_IN": 2207, + "button": 2208, + "ĠIs": 2209, + "ervices": 2210, + "Ġca": 2211, + "Ġaround": 2212, + "App": 2213, + "ration": 2214, + "Ġrece": 2215, + "Ġreally": 2216, + "Ġimage": 2217, + "Ġtarget": 2218, + "Ġdep": 2219, + "opyright": 2220, + "tra": 2221, + "ingle": 2222, + "ital": 2223, + "Layout": 2224, + "Ġboth": 2225, + "Override": 2226, + "arm": 2227, + "=>": 2228, + "aterial": 2229, + "iled": 2230, + "Ġput": 2231, + "Qu": 2232, + "ÑĢ": 2233, + "ung": 2234, + "map": 2235, + "ĉĉĉĉĉĉĉĉ": 2236, + "Ġlevel": 2237, + "Component": 2238, + "book": 2239, + "creen": 2240, + "_RE": 2241, + "Ġconfig": 2242, + "ãģ": 2243, + "Or": 2244, + ".data": 2245, + "Ġdocument": 2246, + "\",\"": 2247, + "tribute": 2248, + "ux": 2249, + "Log": 2250, + "ference": 2251, + "post": 2252, + "_e": 2253, + "Ġlocal": 2254, + "andom": 2255, + "assert": 2256, + "Val": 2257, + "lected": 2258, + "ina": 2259, + "atabase": 2260, + "Add": 2261, + "Ġcontent": 2262, + ".print": 2263, + "signed": 2264, + "ric": 2265, + ".\"ĊĊ": 2266, + "Ġfa": 2267, + "!ĊĊ": 2268, + "-f": 2269, + "ived": 2270, + "Ġquest": 2271, + ".ex": 2272, + "Ġfloat": 2273, + "Ġdevelop": 2274, + "оÐ": 2275, + "Map": 2276, + "ading": 2277, + "Ġposs": 2278, + "UE": 2279, + "namespace": 2280, + "_O": 2281, + "ĉb": 2282, + ".Get": 2283, + ">(": 2284, + "json": 2285, + "etails": 2286, + "66": 2287, + "Ġtoo": 2288, + "Ġextends": 2289, + "ĠNone": 2290, + "Ġfore": 2291, + "(String": 2292, + "format": 2293, + "Ġgreat": 2294, + "inter": 2295, + "cale": 2296, + "Ñģ": 2297, + "ron": 2298, + "iving": 2299, + "Ent": 2300, + "ency": 2301, + "xt": 2302, + "oy": 2303, + "05": 2304, + "Ġmonth": 2305, + "Ġhapp": 2306, + "Ġsuper": 2307, + "bar": 2308, + "default": 2309, + "_de": 2310, + "ords": 2311, + "ln": 2312, + "({Ċ": 2313, + "ĠInd": 2314, + "ases": 2315, + "Ġtitle": 2316, + "Ġcontext": 2317, + "08": 2318, + "oh": 2319, + "-p": 2320, + "Em": 2321, + "Ġmet": 2322, + "Test": 2323, + "Ġlife": 2324, + "_v": 2325, + "ĠUS": 2326, + "UI": 2327, + "ocation": 2328, + "md": 2329, + "Ġ[Ċ": 2330, + "Ġ]": 2331, + "sw": 2332, + "Ġincre": 2333, + "script": 2334, + "ential": 2335, + "ways": 2336, + ".de": 2337, + "Ġsrc": 2338, + "Ġcatch": 2339, + "ĠAmeric": 2340, + "//Ċ": 2341, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2342, + "Ġpay": 2343, + "plit": 2344, + "âĢĶ": 2345, + "Ġcoun": 2346, + "obj": 2347, + ".php": 2348, + "Ġchange": 2349, + "ething": 2350, + "'re": 2351, + "aster": 2352, + "los": 2353, + "lation": 2354, + "ĠĠĊ": 2355, + "Le": 2356, + "ä": 2357, + "({": 2358, + "ready": 2359, + "ĠNo": 2360, + "Ġposition": 2361, + "Ġold": 2362, + "Ġbook": 2363, + "abled": 2364, + "bug": 2365, + "202": 2366, + "Hand": 2367, + "};ĊĊ": 2368, + "isplay": 2369, + "aving": 2370, + "04": 2371, + "Ġgover": 2372, + "Ġversion": 2373, + "System": 2374, + "nect": 2375, + "response": 2376, + "Style": 2377, + "Up": 2378, + "angu": 2379, + "Ġthree": 2380, + "init": 2381, + "ero": 2382, + "Ġlaw": 2383, + "endif": 2384, + "Ġbase": 2385, + "email": 2386, + "(l": 2387, + "_V": 2388, + "Ġconf": 2389, + "ATE": 2390, + "Ġduring": 2391, + "tes": 2392, + "Ġconsole": 2393, + "ĠPr": 2394, + "Ġspe": 2395, + "ves": 2396, + "65": 2397, + "path": 2398, + "ialog": 2399, + "dition": 2400, + "_to": 2401, + "ards": 2402, + "Ġagainst": 2403, + "etwork": 2404, + "ĠPh": 2405, + "_L": 2406, + "cur": 2407, + "imit": 2408, + "With": 2409, + "Ġpower": 2410, + "ium": 2411, + "';ĊĊ": 2412, + "Ġwom": 2413, + "left": 2414, + "ources": 2415, + "atri": 2416, + "ĠIm": 2417, + "ĠMan": 2418, + "orth": 2419, + "${": 2420, + "88": 2421, + "quals": 2422, + "ese": 2423, + "_size": 2424, + "Ġiss": 2425, + "otal": 2426, + "-g": 2427, + "ique": 2428, + "rame": 2429, + "Ġwidth": 2430, + "erg": 2431, + ")(": 2432, + "ittle": 2433, + "TR": 2434, + "ĠThey": 2435, + "ences": 2436, + "02": 2437, + "rl": 2438, + "ons": 2439, + "Ġlabel": 2440, + ".y": 2441, + "-t": 2442, + "update": 2443, + "anel": 2444, + "sc": 2445, + ".to": 2446, + "Ġproject": 2447, + "ü": 2448, + "Ġelement": 2449, + "Ġsuccess": 2450, + "ĉĉĊ": 2451, + ".sh": 2452, + "ram": 2453, + "ched": 2454, + "())Ċ": 2455, + "Ġ(Ċ": 2456, + "Ġdate": 2457, + "Ġtot": 2458, + "_ST": 2459, + "All": 2460, + "ification": 2461, + "ĉvar": 2462, + "Ġtri": 2463, + "chem": 2464, + "my": 2465, + "Ġbig": 2466, + "ĠAd": 2467, + "ĠAt": 2468, + "ots": 2469, + "num": 2470, + "Act": 2471, + "Ġmap": 2472, + "era": 2473, + "cope": 2474, + ".$": 2475, + ",âĢĿ": 2476, + "Ġpop": 2477, + "Ġfew": 2478, + "Ġlen": 2479, + "uid": 2480, + "eters": 2481, + "ules": 2482, + "ÃŃ": 2483, + "source": 2484, + "https": 2485, + "Ġdem": 2486, + "Ġear": 2487, + "################": 2488, + "Ġmatch": 2489, + "ories": 2490, + "49": 2491, + "aces": 2492, + "ĠCl": 2493, + "Ġnode": 2494, + "78": 2495, + "irc": 2496, + "local": 2497, + "unity": 2498, + "};Ċ": 2499, + "Ġanother": 2500, + "<<": 2501, + "ogle": 2502, + "Ġsit": 2503, + "ework": 2504, + "TE": 2505, + ".I": 2506, + "NS": 2507, + "ology": 2508, + "ought": 2509, + ".Cont": 2510, + ">>": 2511, + "Ġcare": 2512, + "state": 2513, + "ĉprivate": 2514, + "Ġeffect": 2515, + "++)": 2516, + "_file": 2517, + "ending": 2518, + "Line": 2519, + "For": 2520, + "ior": 2521, + "ĠSc": 2522, + "Ġfun": 2523, + ".Size": 2524, + "ĉelse": 2525, + "])": 2526, + "start": 2527, + "vious": 2528, + "Ġ},": 2529, + "ours": 2530, + "Ġleg": 2531, + "Ġservice": 2532, + "Ġsince": 2533, + "iron": 2534, + "Label": 2535, + "Ġnon": 2536, + "Ġlos": 2537, + "iction": 2538, + "Ġfull": 2539, + "acter": 2540, + "board": 2541, + "gress": 2542, + "Ġturn": 2543, + "ither": 2544, + "09": 2545, + ".size": 2546, + "Ġbody": 2547, + "resh": 2548, + "eturn": 2549, + "199": 2550, + "(_": 2551, + "yles": 2552, + "ormal": 2553, + "pi": 2554, + "Ġsomething": 2555, + "!--": 2556, + "uint": 2557, + "Ġprodu": 2558, + "Ġstand": 2559, + "Ġproble": 2560, + "Ġavailable": 2561, + "mt": 2562, + "ĠBl": 2563, + "Ġ...": 2564, + "Ġblock": 2565, + "Input": 2566, + "Ġkeep": 2567, + "Count": 2568, + "open": 2569, + "Ġ['": 2570, + "Ġthrow": 2571, + "uilder": 2572, + "Action": 2573, + "Ġthings": 2574, + "True": 2575, + "Ġurl": 2576, + "ĠBo": 2577, + "printf": 2578, + "Ġred": 2579, + "js": 2580, + ".create": 2581, + "ĠOr": 2582, + "Status": 2583, + "Instance": 2584, + "Ġcontrol": 2585, + "Ġcome": 2586, + "Ġcustom": 2587, + "location": 2588, + "07": 2589, + "model": 2590, + "ĠčĊ": 2591, + "Ġsource": 2592, + "Ġeas": 2593, + ".out": 2594, + "]ĊĊ": 2595, + "oney": 2596, + "Ġawait": 2597, + "Ġpartic": 2598, + "AP": 2599, + "ublish": 2600, + "odes": 2601, + "_pro": 2602, + "ply": 2603, + "riter": 2604, + "Ġprov": 2605, + "Ġmill": 2606, + "HT": 2607, + "])Ċ": 2608, + "Ġchang": 2609, + "Ġask": 2610, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2611, + "Ġoutput": 2612, + "Ġemail": 2613, + "68": 2614, + ".push": 2615, + "Ġ}čĊčĊ": 2616, + "ination": 2617, + "47": 2618, + "atrix": 2619, + "Table": 2620, + "uccess": 2621, + "]);Ċ": 2622, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 2623, + "Ġdisc": 2624, + "([": 2625, + "Ġbusiness": 2626, + "height": 2627, + ".html": 2628, + "ta": 2629, + "field": 2630, + "Ġrequired": 2631, + "_R": 2632, + "Ġgovern": 2633, + "}čĊčĊ": 2634, + "lex": 2635, + "500": 2636, + ".,": 2637, + "ĠSet": 2638, + "urch": 2639, + "///": 2640, + "ts": 2641, + "af": 2642, + "Ġmight": 2643, + "istory": 2644, + "Str": 2645, + "Ġnever": 2646, + "Response": 2647, + "arse": 2648, + "ada": 2649, + "ĠHow": 2650, + "Ġ*)": 2651, + "Ġ;": 2652, + "Ġhard": 2653, + "Ad": 2654, + "Ġintern": 2655, + "used": 2656, + "(data": 2657, + "mod": 2658, + "annel": 2659, + "Ġnp": 2660, + "ugg": 2661, + "Ġ/>Ċ": 2662, + "Ġcalled": 2663, + "body": 2664, + "Ġcho": 2665, + "(r": 2666, + "_set": 2667, + "ird": 2668, + "Ġ>=": 2669, + "Ġ};Ċ": 2670, + "Ġoptions": 2671, + "ĠGener": 2672, + "Ġheight": 2673, + "Point": 2674, + "You": 2675, + "ety": 2676, + "Click": 2677, + "Ġsmall": 2678, + "Ġide": 2679, + "Ġaccess": 2680, + "anguage": 2681, + "Ġprotected": 2682, + "Ġjob": 2683, + "ĠThere": 2684, + "Def": 2685, + "Ġaddress": 2686, + "Ġuint": 2687, + "Not": 2688, + "oo": 2689, + "aps": 2690, + "
&": 5909, + "CON": 5910, + "Ġrepl": 5911, + "Ġregular": 5912, + "Storage": 5913, + "ramework": 5914, + "Ġgoal": 5915, + "Ġtouch": 5916, + ".widget": 5917, + "Ġbuilt": 5918, + "des": 5919, + "Part": 5920, + "(re": 5921, + "Ġworth": 5922, + "hib": 5923, + "game": 5924, + "91": 5925, + "192": 5926, + "Ġв": 5927, + "acion": 5928, + "ĠWhite": 5929, + "(type": 5930, + "(`": 5931, + "81": 5932, + "Ġnatural": 5933, + "Ġinj": 5934, + "Ġcalcul": 5935, + "ĠApril": 5936, + ".List": 5937, + "Ġassociated": 5938, + "ĉSystem": 5939, + "~~": 5940, + "=[": 5941, + "Ġstorage": 5942, + "Ġbytes": 5943, + "Ġtravel": 5944, + "Ġsou": 5945, + "Ġpassed": 5946, + "!=": 5947, + "ascript": 5948, + ".open": 5949, + "Ġgrid": 5950, + "Ġbus": 5951, + "Ġrecogn": 5952, + "Ab": 5953, + "Ġhon": 5954, + "ĠCenter": 5955, + "Ġprec": 5956, + "build": 5957, + "73": 5958, + "HTML": 5959, + "ĠSan": 5960, + "Ġcountries": 5961, + "aled": 5962, + "token": 5963, + "kt": 5964, + "Ġqual": 5965, + "Last": 5966, + "adow": 5967, + "Ġmanufact": 5968, + "idad": 5969, + "jango": 5970, + "Next": 5971, + "xf": 5972, + ".a": 5973, + "Ġporno": 5974, + "ĠPM": 5975, + "erve": 5976, + "iting": 5977, + "_th": 5978, + "ci": 5979, + "=None": 5980, + "gs": 5981, + "Ġlogin": 5982, + "atives": 5983, + "']);Ċ": 5984, + "Äħ": 5985, + "Ġill": 5986, + "IA": 5987, + "children": 5988, + "DO": 5989, + "Ġlevels": 5990, + "Ġ{{": 5991, + "Ġlooks": 5992, + "Ġ\"#": 5993, + "ToString": 5994, + "Ġnecessary": 5995, + "ĠĠĠĊ": 5996, + "cell": 5997, + "Entry": 5998, + "Ġ'#": 5999, + "Ġextrem": 6000, + "Selector": 6001, + "Ġplaceholder": 6002, + "Load": 6003, + "Ġreleased": 6004, + "ORE": 6005, + "Enumer": 6006, + "ĠTV": 6007, + "SET": 6008, + "inq": 6009, + "Press": 6010, + "ĠDepartment": 6011, + "Ġproperties": 6012, + "Ġrespond": 6013, + "Search": 6014, + "ael": 6015, + "Ġrequ": 6016, + "ĠBook": 6017, + "/Ċ": 6018, + "(st": 6019, + "Ġfinancial": 6020, + "icket": 6021, + "_input": 6022, + "Ġthreat": 6023, + "(in": 6024, + "Strip": 6025, + "ìĿ": 6026, + "ção": 6027, + "71": 6028, + "Ġevidence": 6029, + "));": 6030, + "ĠBro": 6031, + "Ġ[];Ċ": 6032, + "Ġou": 6033, + "buf": 6034, + "Script": 6035, + "dat": 6036, + "Ġrule": 6037, + "#import": 6038, + "=\"/": 6039, + "Serial": 6040, + "Ġstarting": 6041, + "[index": 6042, + "ae": 6043, + "Ġcontrib": 6044, + "session": 6045, + "_new": 6046, + "utable": 6047, + "ober": 6048, + "Ġ\"./": 6049, + "Ġlogger": 6050, + "Ġrecently": 6051, + "Ġreturned": 6052, + "ččĊ": 6053, + ")))Ċ": 6054, + "itions": 6055, + "Ġseek": 6056, + "Ġcommunic": 6057, + "Ġ\".": 6058, + "Ġusername": 6059, + "ECT": 6060, + "DS": 6061, + "Ġotherwise": 6062, + "ĠGerman": 6063, + ".aw": 6064, + "Adapter": 6065, + "ixel": 6066, + "Ġsystems": 6067, + "Ġdrop": 6068, + "83": 6069, + "Ġstructure": 6070, + "Ġ$(\"#": 6071, + "encies": 6072, + "anning": 6073, + "ĠLink": 6074, + "ĠResponse": 6075, + "Ġstri": 6076, + "ż": 6077, + "ĠDB": 6078, + "æĹ": 6079, + "android": 6080, + "submit": 6081, + "otion": 6082, + "92": 6083, + "(@": 6084, + ".test": 6085, + "82": 6086, + "ĊĊĊĊĊĊĊĊ": 6087, + "];čĊ": 6088, + "Ġdirectly": 6089, + "Ġ\"%": 6090, + "ris": 6091, + "elta": 6092, + "AIL": 6093, + "){čĊ": 6094, + "mine": 6095, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 6096, + "(k": 6097, + "bon": 6098, + "asic": 6099, + "pite": 6100, + "___": 6101, + "Max": 6102, + "Ġerrors": 6103, + "ĠWhile": 6104, + "Ġarguments": 6105, + "Ġensure": 6106, + "Right": 6107, + "-based": 6108, + "Web": 6109, + "Ġ-=": 6110, + "Ġintrodu": 6111, + "ĠInst": 6112, + "ĠWash": 6113, + "ordin": 6114, + "join": 6115, + "Database": 6116, + "Ġgrad": 6117, + "Ġusually": 6118, + "ITE": 6119, + "Props": 6120, + "?>Ċ": 6121, + "ĠGo": 6122, + "@Override": 6123, + "REF": 6124, + "Ġip": 6125, + "ĠAustral": 6126, + "Ġist": 6127, + "ViewById": 6128, + "Ġserious": 6129, + "Ġcustomer": 6130, + ".prototype": 6131, + "odo": 6132, + "cor": 6133, + "Ġdoor": 6134, + "ĠWITHOUT": 6135, + "Ġplant": 6136, + "Ġbegan": 6137, + "Ġdistance": 6138, + "()).": 6139, + "Ġchance": 6140, + "Ġord": 6141, + "came": 6142, + "pragma": 6143, + "Ġprotect": 6144, + "ragment": 6145, + "ĠNode": 6146, + "ening": 6147, + "Ñĩ": 6148, + "Ġroute": 6149, + "ĠSchool": 6150, + "hi": 6151, + "Ġneighb": 6152, + "After": 6153, + "licit": 6154, + "Ġcontr": 6155, + "Ġprimary": 6156, + "AA": 6157, + ".WriteLine": 6158, + "utils": 6159, + "Ġbi": 6160, + "Red": 6161, + ".Linq": 6162, + ".object": 6163, + "Ġleaders": 6164, + "unities": 6165, + "Ġgun": 6166, + "onth": 6167, + "ĠDev": 6168, + "FILE": 6169, + "Ġcomments": 6170, + "_len": 6171, + "arrow": 6172, + "amount": 6173, + "Range": 6174, + "sert": 6175, + "GridView": 6176, + "Ġupdated": 6177, + "ĠMo": 6178, + "Ġinform": 6179, + "ociety": 6180, + "ala": 6181, + "Access": 6182, + "Ġhab": 6183, + "Ġcreat": 6184, + "_arg": 6185, + "ĠJanuary": 6186, + "ĠDay": 6187, + "\")čĊ": 6188, + "uple": 6189, + "document": 6190, + "gorith": 6191, + "menu": 6192, + "ĠOver": 6193, + "bb": 6194, + ".title": 6195, + "_out": 6196, + "Ġled": 6197, + "uri": 6198, + "Ġ?>": 6199, + "gl": 6200, + "Ġbank": 6201, + "ayment": 6202, + "ĉprintf": 6203, + "MD": 6204, + "Ġsample": 6205, + "Ġhands": 6206, + "ĠVersion": 6207, + "uario": 6208, + "Ġoffers": 6209, + "ityEngine": 6210, + "Ġshape": 6211, + "Ġsleep": 6212, + "_point": 6213, + "Settings": 6214, + "Ġachie": 6215, + "Ġsold": 6216, + "ota": 6217, + ".bind": 6218, + "Am": 6219, + "Ġsafe": 6220, + "Store": 6221, + "Ġshared": 6222, + "Ġpriv": 6223, + "_VAL": 6224, + "Ġsens": 6225, + "){": 6226, + "Ġremember": 6227, + "shared": 6228, + "element": 6229, + "Ġshoot": 6230, + "Vert": 6231, + "cout": 6232, + "Ġenv": 6233, + "_label": 6234, + "Ġ>Ċ": 6235, + "run": 6236, + "Ġscene": 6237, + "(array": 6238, + "device": 6239, + "_title": 6240, + "agon": 6241, + "]čĊ": 6242, + "aby": 6243, + "Ġbecame": 6244, + "boolean": 6245, + "Ġpark": 6246, + "ĠCode": 6247, + "upload": 6248, + "riday": 6249, + "ĠSeptember": 6250, + "Fe": 6251, + "Ġsen": 6252, + "cing": 6253, + "FL": 6254, + "Col": 6255, + "uts": 6256, + "_page": 6257, + "inn": 6258, + "Ġimplied": 6259, + "aling": 6260, + "Ġyourself": 6261, + ".Count": 6262, + "conf": 6263, + "Ġaud": 6264, + "_init": 6265, + ".)": 6266, + "Ġwrote": 6267, + "003": 6268, + "NG": 6269, + ".Error": 6270, + "ä»": 6271, + ".for": 6272, + "Ġequal": 6273, + "ĠRequest": 6274, + "Ġserial": 6275, + "Ġallows": 6276, + "XX": 6277, + "Ġmiddle": 6278, + "chor": 6279, + "195": 6280, + "94": 6281, + "ø": 6282, + "erval": 6283, + ".Column": 6284, + "reading": 6285, + "Ġescort": 6286, + "ĠAugust": 6287, + "Ġquickly": 6288, + "Ġweap": 6289, + "ĠCG": 6290, + "ropri": 6291, + "ho": 6292, + "Ġcop": 6293, + "(struct": 6294, + "ĠBig": 6295, + "Ġvs": 6296, + "Ġfrequ": 6297, + ".Value": 6298, + "Ġactions": 6299, + "Ġproper": 6300, + "Ġinn": 6301, + "Ġobjects": 6302, + "Ġmatrix": 6303, + "avascript": 6304, + "Ġones": 6305, + ".group": 6306, + "Ġgreen": 6307, + "Ġpaint": 6308, + "ools": 6309, + "ycl": 6310, + "encode": 6311, + "olt": 6312, + "comment": 6313, + ".api": 6314, + "Dir": 6315, + "Ġune": 6316, + "izont": 6317, + ".position": 6318, + "Ġdesigned": 6319, + "_val": 6320, + "avi": 6321, + "iring": 6322, + "tab": 6323, + "Ġlayer": 6324, + "Ġviews": 6325, + "Ġreve": 6326, + "rael": 6327, + "ĠON": 6328, + "rics": 6329, + "160": 6330, + "np": 6331, + "Ġcore": 6332, + "());čĊ": 6333, + "Main": 6334, + "Ġexpert": 6335, + "ĉĉčĊ": 6336, + "_en": 6337, + "Ġ/>": 6338, + "utter": 6339, + "IAL": 6340, + "ails": 6341, + "ĠKing": 6342, + "*/ĊĊ": 6343, + "ĠMet": 6344, + "_end": 6345, + "addr": 6346, + "ora": 6347, + "Ġir": 6348, + "Min": 6349, + "Ġsurpr": 6350, + "Ġrepe": 6351, + "Ġdirectory": 6352, + "PUT": 6353, + "-S": 6354, + "Ġelection": 6355, + "haps": 6356, + ".pre": 6357, + "cm": 6358, + "Values": 6359, + "Ġ\"Ċ": 6360, + "column": 6361, + "ivil": 6362, + "Login": 6363, + "inue": 6364, + "93": 6365, + "Ġbeautiful": 6366, + "Ġsecret": 6367, + "(event": 6368, + "Ġchat": 6369, + "ums": 6370, + "Ġorigin": 6371, + "Ġeffects": 6372, + "Ġmanagement": 6373, + "illa": 6374, + "tk": 6375, + "Ġsetting": 6376, + "ĠCour": 6377, + "Ġmassage": 6378, + "ĉend": 6379, + "Ġhappy": 6380, + "Ġfinish": 6381, + "Ġcamera": 6382, + "ĠVer": 6383, + "ĠDemocr": 6384, + "ĠHer": 6385, + "(Q": 6386, + "cons": 6387, + "ita": 6388, + "Ġ'.": 6389, + "{}": 6390, + "ĉC": 6391, + "Ġstuff": 6392, + "194": 6393, + "Ġ:Ċ": 6394, + "ĠAR": 6395, + "Task": 6396, + "hidden": 6397, + "eros": 6398, + "IGN": 6399, + "atio": 6400, + "ĠHealth": 6401, + "olute": 6402, + "Enter": 6403, + "'>": 6404, + "ĠTwitter": 6405, + "ĠCounty": 6406, + "scribe": 6407, + "Ġ=>Ċ": 6408, + "Ġhy": 6409, + "fit": 6410, + "Ġmilitary": 6411, + "Ġsale": 6412, + "required": 6413, + "non": 6414, + "bootstrap": 6415, + "hold": 6416, + "rim": 6417, + "-old": 6418, + "ĠDown": 6419, + "Ġmention": 6420, + "contact": 6421, + "_group": 6422, + "oday": 6423, + "Ġtown": 6424, + "Ġsolution": 6425, + "uate": 6426, + "elling": 6427, + "]->": 6428, + "otes": 6429, + "ental": 6430, + "omen": 6431, + "ospital": 6432, + "ĠSup": 6433, + "_EN": 6434, + "Ġslow": 6435, + "SESSION": 6436, + "Ġblue": 6437, + "ago": 6438, + "Ġlives": 6439, + "Ġ^": 6440, + ".un": 6441, + "inst": 6442, + "enge": 6443, + "Ġcustomers": 6444, + "Ġcast": 6445, + "udget": 6446, + "ï¼ģ": 6447, + "icens": 6448, + "Ġdetermin": 6449, + "Selected": 6450, + "_pl": 6451, + "ueue": 6452, + "Ġdark": 6453, + "//ĊĊ": 6454, + "si": 6455, + "thern": 6456, + "ĠJapan": 6457, + "/w": 6458, + "PU": 6459, + "ĠEast": 6460, + "ovie": 6461, + "Ġpackage": 6462, + "Ġnor": 6463, + "Ġapi": 6464, + "bot": 6465, + "\"];Ċ": 6466, + "_post": 6467, + "ulate": 6468, + "Ġclub": 6469, + "'));Ċ": 6470, + "Ġloop": 6471, + "PIO": 6472, + "ione": 6473, + "shot": 6474, + "Initial": 6475, + "Ġplayed": 6476, + "register": 6477, + "rought": 6478, + "_max": 6479, + "acement": 6480, + "match": 6481, + "raphics": 6482, + "AST": 6483, + "Ġexisting": 6484, + "Ġcomplex": 6485, + "DA": 6486, + ".Ch": 6487, + ".common": 6488, + "mo": 6489, + "Ġ'../../": 6490, + "ito": 6491, + "Ġanalysis": 6492, + "Ġdeliver": 6493, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 6494, + "idx": 6495, + "Ãł": 6496, + "ongo": 6497, + "ĠEnglish": 6498, + "Ċ": 10197, + "_default": 10198, + "ĠDatabase": 10199, + "rep": 10200, + "ESS": 10201, + "nergy": 10202, + ".Find": 10203, + "_mask": 10204, + "Ġrise": 10205, + "Ġkernel": 10206, + "::$": 10207, + ".Q": 10208, + "Ġoffering": 10209, + "decl": 10210, + "ĠCS": 10211, + "Ġlisted": 10212, + "Ġmostly": 10213, + "enger": 10214, + "Ġblocks": 10215, + "olo": 10216, + "Ġgoverning": 10217, + "\\F": 10218, + "Ġconcent": 10219, + ".getText": 10220, + "Ġmb": 10221, + "Ġoccurred": 10222, + "Ġchanging": 10223, + "Scene": 10224, + "_CODE": 10225, + "Beh": 10226, + "\"The": 10227, + "Ġtile": 10228, + "ĠAssociation": 10229, + "ĉP": 10230, + "alty": 10231, + "_ad": 10232, + "odies": 10233, + "iated": 10234, + "Ġprepared": 10235, + "possible": 10236, + "Ġmort": 10237, + "TEST": 10238, + "142": 10239, + "Ġignore": 10240, + "Ġcalc": 10241, + "Ġrs": 10242, + "ĠassertEquals": 10243, + "Ġsz": 10244, + "ĠTHIS": 10245, + ".\"Ċ": 10246, + "Ġcanvas": 10247, + "java": 10248, + "Ġdut": 10249, + "VALID": 10250, + ".sql": 10251, + ".input": 10252, + "Ġaux": 10253, + "Sup": 10254, + "Ġartist": 10255, + "Vec": 10256, + "_TIME": 10257, + ".stringify": 10258, + "etween": 10259, + "ĠCategory": 10260, + "Ġ[-": 10261, + "ĠDevExpress": 10262, + "ĠJul": 10263, + "Ġring": 10264, + ".ed": 10265, + "YY": 10266, + "Let": 10267, + "TextField": 10268, + "Ġflat": 10269, + "_print": 10270, + "ĠOTHER": 10271, + "adian": 10272, + "Ġchecked": 10273, + "ele": 10274, + "Align": 10275, + "standing": 10276, + "Ġ[],": 10277, + "Ġlab": 10278, + "ucky": 10279, + "ĠChristmas": 10280, + "(image": 10281, + ".module": 10282, + "Ġlots": 10283, + "Ġslightly": 10284, + "(final": 10285, + "erge": 10286, + "è¿": 10287, + "147": 10288, + "ĠPolice": 10289, + "143": 10290, + "ĠRight": 10291, + "Ġaward": 10292, + "ĠOS": 10293, + "Ġ{}ĊĊ": 10294, + "Ġptr": 10295, + "oves": 10296, + "icated": 10297, + "ем": 10298, + "Ġmanage": 10299, + "oliday": 10300, + "Amount": 10301, + "oolStrip": 10302, + "tbody": 10303, + "Nav": 10304, + "wrap": 10305, + "BB": 10306, + "Ġwatching": 10307, + "arios": 10308, + "Ġoptional": 10309, + "_K": 10310, + "ĠLicensed": 10311, + ".Map": 10312, + "Timer": 10313, + "ĠAP": 10314, + "ĠRev": 10315, + "(o": 10316, + ",c": 10317, + "umin": 10318, + "etailed": 10319, + "ĠHy": 10320, + "Ġblank": 10321, + "agger": 10322, + "ĠSelf": 10323, + "()[": 10324, + ".make": 10325, + "earn": 10326, + "channel": 10327, + ";Ċ": 10342, + "World": 10343, + "Ġpython": 10344, + "Ġlif": 10345, + "Ġtrav": 10346, + "Ġconven": 10347, + "company": 10348, + "ĠClub": 10349, + "138": 10350, + "Ver": 10351, + "Btn": 10352, + "Ġzone": 10353, + "products": 10354, + "ĠEduc": 10355, + "Ġverify": 10356, + "ĠMil": 10357, + "ono": 10358, + "]);ĊĊ": 10359, + "ENCE": 10360, + "Ġpacket": 10361, + "Ġcer": 10362, + "Ġenumer": 10363, + "Ġpars": 10364, + "formed": 10365, + "Ġoccup": 10366, + "tre": 10367, + "Ġexercise": 10368, + "Day": 10369, + "_sum": 10370, + "Ġasking": 10371, + "aption": 10372, + "Ġorders": 10373, + "Ġspending": 10374, + "ĠERR": 10375, + ".Dis": 10376, + "ĠUtil": 10377, + "âĢľI": 10378, + "\\'": 10379, + "?)": 10380, + "/>Ċ": 10381, + "Ġemot": 10382, + "Ġinfluence": 10383, + "ĠAfrica": 10384, + "atters": 10385, + "Ùħ": 10386, + ".session": 10387, + "Ġchief": 10388, + "ĉĉĉĉĉĉĉĉĉĉĉ": 10389, + "Ġtom": 10390, + "cluded": 10391, + "serial": 10392, + "_handler": 10393, + ".Type": 10394, + "aped": 10395, + "Ġpolicies": 10396, + "-ex": 10397, + "-tr": 10398, + "blank": 10399, + "merce": 10400, + "Ġcoverage": 10401, + "Ġrc": 10402, + "_matrix": 10403, + "_box": 10404, + "Ġcharges": 10405, + "ĠBoston": 10406, + "Pe": 10407, + "Ġcircum": 10408, + "Ġfilled": 10409, + "148": 10410, + "Ġnorth": 10411, + "ictureBox": 10412, + "ĉres": 10413, + "è®": 10414, + "Ġtermin": 10415, + "Ġ[â̦": 10416, + "IRECT": 10417, + "Ġber": 10418, + "Ġ\"../../": 10419, + "retch": 10420, + ".code": 10421, + "_col": 10422, + "ĠGovernment": 10423, + "Ġargv": 10424, + "ĠLord": 10425, + "asi": 10426, + "Exec": 10427, + "ĉlet": 10428, + "vertis": 10429, + "Ġdiscussion": 10430, + "enance": 10431, + "outube": 10432, + "typeof": 10433, + "Ġserved": 10434, + "ĠPut": 10435, + "ĉx": 10436, + "Ġsweet": 10437, + "Before": 10438, + "ategy": 10439, + ".of": 10440, + "ĠMaterial": 10441, + "Sort": 10442, + "ONT": 10443, + "igital": 10444, + "Why": 10445, + "Ġsust": 10446, + "Ġç": 10447, + "abet": 10448, + "Ġsegment": 10449, + "Ġ[],Ċ": 10450, + "ĠMuslim": 10451, + "ĠfindViewById": 10452, + "cut": 10453, + "_TEXT": 10454, + "ĠMary": 10455, + "Ġloved": 10456, + "Ġlie": 10457, + "ĠJO": 10458, + "Ġisset": 10459, + "month": 10460, + "Ġprime": 10461, + "ti": 10462, + "ĠCarol": 10463, + "Use": 10464, + "146": 10465, + "ĠPop": 10466, + "ĠSave": 10467, + "Interval": 10468, + "execute": 10469, + "dy": 10470, + "ĠIran": 10471, + "_cont": 10472, + "ĉT": 10473, + "Ġphase": 10474, + "checkbox": 10475, + "week": 10476, + "Ġhide": 10477, + "Ġtil": 10478, + "Ġju": 10479, + "Custom": 10480, + "burg": 10481, + "/M": 10482, + "TON": 10483, + "Ġquant": 10484, + "Ġrub": 10485, + "ixels": 10486, + "Ġinstalled": 10487, + "Ġdump": 10488, + "Ġproperly": 10489, + "(List": 10490, + "Ġdecide": 10491, + "apply": 10492, + "Has": 10493, + "Ġkeeping": 10494, + "Ġcitizens": 10495, + "Ġjoint": 10496, + "pool": 10497, + "Socket": 10498, + "_op": 10499, + "Ġweapon": 10500, + "gnore": 10501, + "ĠExec": 10502, + "otten": 10503, + "ĠMS": 10504, + "Ġ(-": 10505, + "ĠReview": 10506, + "Ġexamples": 10507, + "Ġtight": 10508, + "!(": 10509, + "DP": 10510, + "ĠMessageBox": 10511, + "Ġphotograph": 10512, + "164": 10513, + "URI": 10514, + "ét": 10515, + "low": 10516, + "ĠGrand": 10517, + ".persistence": 10518, + "Ġmaintain": 10519, + "Ġnums": 10520, + "Ġzip": 10521, + "ials": 10522, + "ĠGets": 10523, + "peg": 10524, + "ĠBuffer": 10525, + "~~~~": 10526, + "rastructure": 10527, + "ĠPL": 10528, + "uen": 10529, + "obby": 10530, + "sizeof": 10531, + "Ġpic": 10532, + "Ġseed": 10533, + "Ġexperienced": 10534, + "Ġodd": 10535, + "Ġkick": 10536, + "Ġprocedure": 10537, + "avigator": 10538, + "-on": 10539, + ",j": 10540, + "ĠAlthough": 10541, + "ĠuserId": 10542, + "accept": 10543, + "Blue": 10544, + "IColor": 10545, + "layer": 10546, + "available": 10547, + "Ġends": 10548, + ".table": 10549, + "Ġdataset": 10550, + "bus": 10551, + "Ġexplain": 10552, + "(pro": 10553, + "ĠCommittee": 10554, + "Ġnoted": 10555, + "]:Ċ": 10556, + "Dim": 10557, + "stdio": 10558, + "154": 10559, + ".\",Ċ": 10560, + "_source": 10561, + "181": 10562, + "ĠWeek": 10563, + "ĠEdge": 10564, + "Ġoperating": 10565, + "Ġeste": 10566, + "ipl": 10567, + "330": 10568, + "agination": 10569, + "Ġproceed": 10570, + "Ġanimation": 10571, + ".Models": 10572, + "ĠWatch": 10573, + "iat": 10574, + "Ġoppon": 10575, + "/A": 10576, + "Report": 10577, + "Ġsounds": 10578, + "_buf": 10579, + "IELD": 10580, + "Ġbund": 10581, + "ĉget": 10582, + ".pr": 10583, + "(tmp": 10584, + "Ġkid": 10585, + ">ĊĊĊ": 10586, + "Ġyang": 10587, + "NotFound": 10588, + "ÑĨ": 10589, + "math": 10590, + "@gmail": 10591, + "ĠLIMIT": 10592, + "redients": 10593, + "Ġvent": 10594, + "avigate": 10595, + "Look": 10596, + "Ġreligious": 10597, + "Ġrand": 10598, + "rio": 10599, + "(GL": 10600, + "_ip": 10601, + "uan": 10602, + "iciency": 10603, + "ĠChange": 10604, + ">čĊčĊ": 10605, + "ĠEntity": 10606, + "Ġrencontre": 10607, + "ĠRet": 10608, + "plan": 10609, + "én": 10610, + "BOOL": 10611, + "uries": 10612, + "train": 10613, + "Definition": 10614, + "============": 10615, + "zz": 10616, + "450": 10617, + "Animation": 10618, + "ĠOK": 10619, + "_menu": 10620, + ".bl": 10621, + "_score": 10622, + "Ġacad": 10623, + "(System": 10624, + "Ġrefresh": 10625, + "'=>$": 10626, + ".Graphics": 10627, + "amento": 10628, + "pid": 10629, + "tc": 10630, + "Ġtips": 10631, + "Ġhomes": 10632, + "Ġfuel": 10633, + "âĸ": 10634, + "_helper": 10635, + "ĠĠčĊ": 10636, + "ĠRoom": 10637, + ".Close": 10638, + "_attr": 10639, + "ĠMount": 10640, + "ĠEv": 10641, + "arser": 10642, + "_top": 10643, + "eah": 10644, + "ĠDelete": 10645, + "ãĢį": 10646, + "uke": 10647, + "Ġusage": 10648, + "aria": 10649, + "_dev": 10650, + "Ġtexture": 10651, + "Ġconversation": 10652, + "eper": 10653, + "Bean": 10654, + "done": 10655, + "nonatomic": 10656, + "ĠSecond": 10657, + "Ġshooting": 10658, + "_pre": 10659, + "Components": 10660, + "Ġ]ĊĊ": 10661, + "__,": 10662, + "stitution": 10663, + ".Char": 10664, + ">();ĊĊ": 10665, + "Ġpresented": 10666, + "Ġwa": 10667, + "oker": 10668, + "-ĊĊ": 10669, + "iner": 10670, + "Ġbecoming": 10671, + "Ġincident": 10672, + "Att": 10673, + "162": 10674, + "Ġrevealed": 10675, + "forc": 10676, + "Ġboot": 10677, + ".page": 10678, + "Enumerator": 10679, + "165": 10680, + "_->": 10681, + "Photo": 10682, + "Ġspring": 10683, + ".\",": 10684, + "ĠDictionary": 10685, + "BJECT": 10686, + "Ġlocations": 10687, + "Ġsamples": 10688, + "InputStream": 10689, + "ĠBrown": 10690, + "Ġstats": 10691, + "quality": 10692, + "Ñħ": 10693, + "-dis": 10694, + "Ġhelping": 10695, + "Ġped": 10696, + "224": 10697, + "(se": 10698, + "ĠWho": 10699, + "alian": 10700, + "internal": 10701, + "Ġft": 10702, + ">().": 10703, + "->{": 10704, + "Ġmine": 10705, + "Ġsector": 10706, + "Ġgro": 10707, + "Ġopportunities": 10708, + "Ġü": 10709, + "Ġmp": 10710, + "Ġalleged": 10711, + "Ġdoubt": 10712, + "Mouse": 10713, + "About": 10714, + "_part": 10715, + "Ġchair": 10716, + "Ġstopped": 10717, + "161": 10718, + "loop": 10719, + "entities": 10720, + "Ġapps": 10721, + "ansion": 10722, + "Ġmental": 10723, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10724, + "FR": 10725, + "Ġdefend": 10726, + "care": 10727, + "Ġideal": 10728, + "/api": 10729, + "urface": 10730, + "011": 10731, + "Ġele": 10732, + "ulator": 10733, + "ĠRights": 10734, + "anguages": 10735, + "Ġfunds": 10736, + "Ġadapt": 10737, + "Attributes": 10738, + "Ġdeploy": 10739, + "opts": 10740, + "Ġvalidation": 10741, + "Ġconcerns": 10742, + "uce": 10743, + ".num": 10744, + "ulture": 10745, + "ila": 10746, + "Ġcup": 10747, + "Ġpure": 10748, + ".Fore": 10749, + "183": 10750, + "ĠHashMap": 10751, + ".valueOf": 10752, + "asm": 10753, + "MO": 10754, + "Ġcs": 10755, + "Ġstores": 10756, + "Ġ************************************************************************": 10757, + "Ġcommunication": 10758, + "mem": 10759, + ".EventHandler": 10760, + ".Status": 10761, + "_right": 10762, + ".setOn": 10763, + "Sheet": 10764, + "Ġidentify": 10765, + "enerated": 10766, + "ordered": 10767, + "Ġ\"[": 10768, + "Ġswe": 10769, + "Condition": 10770, + "ĠAccording": 10771, + "Ġprepare": 10772, + "Ġrob": 10773, + "Pool": 10774, + "Ġsport": 10775, + "rv": 10776, + "ĠRouter": 10777, + "Ġalternative": 10778, + "([]": 10779, + "ĠChicago": 10780, + "ipher": 10781, + "ische": 10782, + "ĠDirector": 10783, + "kl": 10784, + "ĠWil": 10785, + "keys": 10786, + "Ġmysql": 10787, + "Ġwelcome": 10788, + "king": 10789, + "ĠManager": 10790, + "Ġcaught": 10791, + ")}Ċ": 10792, + "Score": 10793, + "_PR": 10794, + "Ġsurvey": 10795, + "hab": 10796, + "Headers": 10797, + "ADER": 10798, + "Ġdecor": 10799, + "Ġturns": 10800, + "Ġradius": 10801, + "errupt": 10802, + "Cor": 10803, + "Ġmel": 10804, + "Ġintr": 10805, + "(q": 10806, + "ĠAC": 10807, + "amos": 10808, + "MAX": 10809, + "ĠGrid": 10810, + "ĠJesus": 10811, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 10812, + ".DE": 10813, + "Ġts": 10814, + "Ġlinked": 10815, + "free": 10816, + "ĠQt": 10817, + "Ġ/**čĊ": 10818, + "Ġfaster": 10819, + "ctr": 10820, + "_J": 10821, + "DT": 10822, + ".Check": 10823, + "Ġcombination": 10824, + "Ġintended": 10825, + "-the": 10826, + "-type": 10827, + "182": 10828, + "ectors": 10829, + "ami": 10830, + "uting": 10831, + "Ġuma": 10832, + "XML": 10833, + "UCT": 10834, + "Ap": 10835, + "ĠRandom": 10836, + "Ġran": 10837, + ".sort": 10838, + "Ġsorted": 10839, + ".Un": 10840, + "401": 10841, + "_PER": 10842, + "itory": 10843, + "Ġpriority": 10844, + "ĠGal": 10845, + "ĠOld": 10846, + "hot": 10847, + "ĠDisplay": 10848, + "(sub": 10849, + "_TH": 10850, + "_Y": 10851, + "ĠCare": 10852, + "loading": 10853, + "Kind": 10854, + "_handle": 10855, + ",,": 10856, + "rase": 10857, + "_replace": 10858, + ".addEventListener": 10859, + "ĠRT": 10860, + "172": 10861, + "Ġentered": 10862, + "gers": 10863, + "Ġich": 10864, + "(start": 10865, + "205": 10866, + "/app": 10867, + "Ġbrother": 10868, + "Memory": 10869, + "Outlet": 10870, + "Ġutf": 10871, + "prec": 10872, + "Ġnavigation": 10873, + "ORK": 10874, + "Ġdst": 10875, + "Detail": 10876, + "Ġaudience": 10877, + "Ġdur": 10878, + "Ġcluster": 10879, + "unched": 10880, + "Ġ],": 10881, + "Ġcomfortable": 10882, + ".values": 10883, + "ĠTotal": 10884, + "Ġsnap": 10885, + "Ġstandards": 10886, + "Ġperformed": 10887, + "hand": 10888, + "(\"@": 10889, + "åŃ": 10890, + "Ġphil": 10891, + "ibr": 10892, + "trim": 10893, + "Ġforget": 10894, + "157": 10895, + "Ġdoctor": 10896, + ".TextBox": 10897, + "377": 10898, + "icons": 10899, + ",s": 10900, + "ĠOp": 10901, + "Sm": 10902, + "Stop": 10903, + "ĉList": 10904, + "ĉu": 10905, + "Comment": 10906, + "_VERSION": 10907, + ".Xtra": 10908, + "Person": 10909, + "rb": 10910, + "LOB": 10911, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĊ": 10912, + "ĠCentral": 10913, + "270": 10914, + "ICK": 10915, + "raq": 10916, + "Ġputting": 10917, + "Ġmd": 10918, + "ĠLove": 10919, + "Program": 10920, + "Border": 10921, + "oor": 10922, + "Ġallowing": 10923, + "after": 10924, + "Ġentries": 10925, + "ĠMaybe": 10926, + "]).": 10927, + "ĠShort": 10928, + ")\\": 10929, + ".now": 10930, + "friend": 10931, + "Ġprefer": 10932, + "ĠGPIO": 10933, + "osis": 10934, + "ĠGameObject": 10935, + "Ġskip": 10936, + "Ġcompetition": 10937, + "_match": 10938, + "lications": 10939, + "_CONT": 10940, + ".groupBox": 10941, + "Ġals": 10942, + "666": 10943, + "\"We": 10944, + "_eq": 10945, + "lan": 10946, + "_search": 10947, + "ĠMusic": 10948, + "asis": 10949, + "Ġbind": 10950, + "ĠIsland": 10951, + "rum": 10952, + "(E": 10953, + "Ġseat": 10954, + "Video": 10955, + "Ġack": 10956, + "reek": 10957, + "={()": 10958, + "Ġrating": 10959, + "Ġrestaurant": 10960, + "456": 10961, + "DEX": 10962, + "(buf": 10963, + "pping": 10964, + "uality": 10965, + "Ġleague": 10966, + "176": 10967, + "Ġfocused": 10968, + "apon": 10969, + "$data": 10970, + "CLUD": 10971, + "CLUDING": 10972, + "Ġabsolute": 10973, + "(query": 10974, + "Ġtells": 10975, + "Ang": 10976, + "Ġcommunities": 10977, + "Ġhonest": 10978, + "oking": 10979, + "Ġapart": 10980, + "arity": 10981, + "/$": 10982, + "_module": 10983, + "ĠEnc": 10984, + ".an": 10985, + ".Config": 10986, + "Cre": 10987, + "Ġshock": 10988, + "ĠArab": 10989, + "IENT": 10990, + "/re": 10991, + "Ġretrie": 10992, + "ycler": 10993, + "isa": 10994, + "ĠOrgan": 10995, + ".graph": 10996, + "Ġí": 10997, + "ĠBAS": 10998, + "Enum": 10999, + "Ġpossibly": 11000, + "ÑĢаÐ": 11001, + "ĠJapanese": 11002, + "Ġcraft": 11003, + "ĠPlace": 11004, + "Ġtalent": 11005, + "Ġfunding": 11006, + "Ġconfirmed": 11007, + "Ġcycle": 11008, + "/x": 11009, + "GE": 11010, + "Ġhearing": 11011, + "Ġplants": 11012, + "Ġmouth": 11013, + "pages": 11014, + "oria": 11015, + "ĠRemove": 11016, + "_total": 11017, + "Ġod": 11018, + "ollapse": 11019, + "door": 11020, + "Ġbought": 11021, + "Ġaddr": 11022, + "ARCH": 11023, + "_dim": 11024, + "dden": 11025, + "Ġdecades": 11026, + "REQUEST": 11027, + "Ġversions": 11028, + "fire": 11029, + "006": 11030, + "Ġmoves": 11031, + "fb": 11032, + "Ġcoffee": 11033, + ".connect": 11034, + "ĠRow": 11035, + "Ġschema": 11036, + "Scope": 11037, + "-Type": 11038, + "Ġfighting": 11039, + "Ġretail": 11040, + "Ġmodified": 11041, + "TF": 11042, + "Files": 11043, + "nie": 11044, + "_command": 11045, + "stone": 11046, + "ĠÑĤ": 11047, + "_thread": 11048, + "Ġbond": 11049, + "ĠDevelopment": 11050, + "Ġpt": 11051, + "FORM": 11052, + "plet": 11053, + "Ġidentified": 11054, + "cpp": 11055, + "206": 11056, + "225": 11057, + "Ġcoding": 11058, + "oked": 11059, + "ĠMaster": 11060, + "IDTH": 11061, + "Ġresidents": 11062, + "redit": 11063, + "ĠPhoto": 11064, + "=-": 11065, + "unte": 11066, + "ateur": 11067, + "159": 11068, + "_STATE": 11069, + "ĠSing": 11070, + "Ġsheet": 11071, + ".val": 11072, + "orse": 11073, + "Ġhers": 11074, + "Ġdetermined": 11075, + "Common": 11076, + "Ġwed": 11077, + "_queue": 11078, + "PH": 11079, + "ĠAtl": 11080, + "cred": 11081, + "/LICENSE": 11082, + "Ġmes": 11083, + "Ġadvanced": 11084, + ".java": 11085, + ".Sh": 11086, + "Go": 11087, + "kill": 11088, + "fp": 11089, + "_settings": 11090, + "Ġpal": 11091, + "Ġtruck": 11092, + "Ġcombined": 11093, + "Ġ\"${": 11094, + "ĠCorpor": 11095, + "Ġjoined": 11096, + "ĠJose": 11097, + "ĠCup": 11098, + "uns": 11099, + "estival": 11100, + "levision": 11101, + "Ġbroken": 11102, + "Ġmarriage": 11103, + "ĠWestern": 11104, + "Ġrepresents": 11105, + "ĠTitle": 11106, + "Ġss": 11107, + ".Ass": 11108, + "ongoose": 11109, + "iento": 11110, + "<>();Ċ": 11111, + "Ġabsolutely": 11112, + "Ġsmooth": 11113, + "TERN": 11114, + "ĠUnless": 11115, + "Word": 11116, + "Ġmerge": 11117, + "igan": 11118, + "ĠVol": 11119, + "Ġnn": 11120, + ".getId": 11121, + "Ġз": 11122, + "171": 11123, + "Ġsexy": 11124, + "Ġseeking": 11125, + "Single": 11126, + ".this": 11127, + "179": 11128, + "Ġkom": 11129, + "bound": 11130, + ";\"": 11131, + "ĠfontSize": 11132, + "_df": 11133, + "Ġinjury": 11134, + "(H": 11135, + "Ġissued": 11136, + "_END": 11137, + ":self": 11138, + "020": 11139, + "Ġpatch": 11140, + "Ġleaves": 11141, + "Ġadopt": 11142, + "FileName": 11143, + "ãĢIJ": 11144, + "Ġexecutive": 11145, + "ĠByte": 11146, + "]))Ċ": 11147, + "Ġnu": 11148, + "outing": 11149, + "cluding": 11150, + "-R": 11151, + ".options": 11152, + "Ġsubstant": 11153, + "avax": 11154, + "ĠBUT": 11155, + "Ġtechnical": 11156, + "Ġtwice": 11157, + "Ġmás": 11158, + "Ġunivers": 11159, + "yr": 11160, + "Ġdrag": 11161, + "ĠDC": 11162, + "Ġsed": 11163, + "Ġbot": 11164, + "ĠPal": 11165, + "ĠHall": 11166, + "forcement": 11167, + "Ġauch": 11168, + ".mod": 11169, + "notation": 11170, + "_files": 11171, + ".line": 11172, + "_flag": 11173, + "[name": 11174, + "Ġresolution": 11175, + "Ġbott": 11176, + "(\"[": 11177, + "ende": 11178, + "(arr": 11179, + "Free": 11180, + "(@\"": 11181, + "ĠDistrict": 11182, + "PEC": 11183, + ":-": 11184, + "Picker": 11185, + "ĠJo": 11186, + "ĠĠĠĠĠĊ": 11187, + "ĠRiver": 11188, + "_rows": 11189, + "Ġhelpful": 11190, + "Ġmassive": 11191, + "---Ċ": 11192, + "Ġmeasures": 11193, + "007": 11194, + "ĠRuntime": 11195, + "Ġworry": 11196, + "ĠSpec": 11197, + "ĉD": 11198, + "ãĢij": 11199, + "Ġ){Ċ": 11200, + "Ġworse": 11201, + "(filename": 11202, + "Ġlay": 11203, + "Ġmagic": 11204, + "ĠTheir": 11205, + "oul": 11206, + "stroy": 11207, + "ĠWhere": 11208, + "280": 11209, + "Ġsudden": 11210, + "Ġdefe": 11211, + "Ġbinding": 11212, + "Ġflight": 11213, + "ĠOnInit": 11214, + "ĠWomen": 11215, + "ĠPolicy": 11216, + "Ġdrugs": 11217, + "ishing": 11218, + "('../": 11219, + "ĠMel": 11220, + "peat": 11221, + "tor": 11222, + "Ġproposed": 11223, + "Ġstated": 11224, + "_RES": 11225, + "Ġeast": 11226, + "212": 11227, + "ĠCONDITION": 11228, + "_desc": 11229, + "Ġwinning": 11230, + "folio": 11231, + "Mapper": 11232, + "ĠPan": 11233, + "ĠAnge": 11234, + ".servlet": 11235, + "Ġcopies": 11236, + "LM": 11237, + "Ġvm": 11238, + "åį": 11239, + "Ġdictionary": 11240, + "Seg": 11241, + "177": 11242, + "elines": 11243, + "ĠSend": 11244, + "Ġiron": 11245, + "ĠFort": 11246, + "166": 11247, + ".domain": 11248, + "Ġdebate": 11249, + "NotNull": 11250, + "eq": 11251, + "acher": 11252, + "lf": 11253, + "ĉfmt": 11254, + "Ġlawy": 11255, + "178": 11256, + "ÄŁ": 11257, + "ĠMen": 11258, + "Ġtrim": 11259, + "(NULL": 11260, + "Ġ!!": 11261, + "Ġpad": 11262, + "Ġfollows": 11263, + "\"][\"": 11264, + "requ": 11265, + "ĠEp": 11266, + ".github": 11267, + "(img": 11268, + "eto": 11269, + "('\\": 11270, + "Services": 11271, + "umbnail": 11272, + "_main": 11273, + "pleted": 11274, + "fortunately": 11275, + "Ġwindows": 11276, + "Ġplane": 11277, + "ĠConnection": 11278, + ".local": 11279, + "uard": 11280, + "}\\": 11281, + "==\"": 11282, + "andon": 11283, + "ĠRoy": 11284, + "west": 11285, + "158": 11286, + "iginal": 11287, + "emies": 11288, + "itz": 11289, + "'):Ċ": 11290, + "ĠPeter": 11291, + "Ġtough": 11292, + "Ġreduced": 11293, + "Ġcalculate": 11294, + "Ġrapid": 11295, + "customer": 11296, + "Ġefficient": 11297, + "Ġmedium": 11298, + "Ġfell": 11299, + ".ref": 11300, + "ĠCas": 11301, + "Ġfeedback": 11302, + "Speed": 11303, + "(output": 11304, + "aje": 11305, + "Ġcategories": 11306, + "Ġfee": 11307, + "};": 11308, + "Ġdeleted": 11309, + "reh": 11310, + "Ġproof": 11311, + "Desc": 11312, + "Build": 11313, + "Ġsides": 11314, + ".ArrayList": 11315, + "-%": 11316, + "ĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠĠ": 11317, + "ر": 11318, + ".match": 11319, + "ли": 11320, + "Ġfeels": 11321, + "Ġachieve": 11322, + "Ġclim": 11323, + "_ON": 11324, + "ĠCD": 11325, + "Ġteacher": 11326, + "_current": 11327, + "bn": 11328, + "_PL": 11329, + "isting": 11330, + "Enable": 11331, + "GEN": 11332, + "Ġtv": 11333, + "Ġsock": 11334, + "Ġplays": 11335, + "Ġdiscount": 11336, + "ĠKE": 11337, + "ĠDebug": 11338, + "Fore": 11339, + "ĠIraq": 11340, + "Ġappearance": 11341, + "Mon": 11342, + "Ġstyled": 11343, + "ĠHuman": 11344, + "iot": 11345, + "ĠHistory": 11346, + "Ġsac": 11347, + "ĠCollection": 11348, + "Ġrecommended": 11349, + ".Selected": 11350, + "Ġorganizations": 11351, + "Ġdiscovered": 11352, + "cohol": 11353, + "adas": 11354, + "ĠThomas": 11355, + "May": 11356, + "Ġconserv": 11357, + "Ġdomin": 11358, + "ĠFollow": 11359, + "ĠSection": 11360, + "ĠThanks": 11361, + "Username": 11362, + "Ġrecipe": 11363, + "Ġwonderful": 11364, + ".sleep": 11365, + "_if": 11366, + "ĉĊĉĊ": 11367, + "orno": 11368, + "Ġru": 11369, + "_target": 11370, + ".\"\"": 11371, + "à¦": 11372, + "EventArgs": 11373, + "Ġinputs": 11374, + "Ġfif": 11375, + "Ġvision": 11376, + "cy": 11377, + "ĠSeries": 11378, + ")(((": 11379, + "Ġtrading": 11380, + "Ġmarker": 11381, + "Begin": 11382, + "Ġtypically": 11383, + "Ġcauses": 11384, + "dropdown": 11385, + "_DEBUG": 11386, + "260": 11387, + "Ġdetect": 11388, + "country": 11389, + "!\");Ċ": 11390, + "ĉR": 11391, + "appy": 11392, + "Ġcref": 11393, + "('<": 11394, + "\"=>": 11395, + "ĠLE": 11396, + "reader": 11397, + "Ġadministr": 11398, + "õ": 11399, + "ucket": 11400, + "Ġfashion": 11401, + ".char": 11402, + "izar": 11403, + "Ġdisable": 11404, + "Ġsuc": 11405, + "ĠLive": 11406, + "issue": 11407, + "Ġmetadata": 11408, + "flags": 11409, + "ĠðŁ": 11410, + "Ġcommitted": 11411, + "Ġva": 11412, + "Ġrough": 11413, + "Ġ'''Ċ": 11414, + "Ġhighlight": 11415, + "_vars": 11416, + "VO": 11417, + "Ġencoding": 11418, + "-Z": 11419, + "_sign": 11420, + "$(\"#": 11421, + "Ġrain": 11422, + "reatest": 11423, + "ĠEND": 11424, + "Selection": 11425, + "Ġcandidates": 11426, + "Ġsav": 11427, + ".Empty": 11428, + "Ġdecisions": 11429, + "Ġcollabor": 11430, + "ridge": 11431, + "feed": 11432, + "ression": 11433, + "Ġpersons": 11434, + "VM": 11435, + "008": 11436, + "ega": 11437, + "_BIT": 11438, + "According": 11439, + "acked": 11440, + "Ġdollars": 11441, + "_loss": 11442, + "ĠCost": 11443, + "}\"Ċ": 11444, + "Notification": 11445, + "Ġprostit": 11446, + "Ġauthority": 11447, + ".rec": 11448, + "Ġspokes": 11449, + "ĠToday": 11450, + "istant": 11451, + "ĠHead": 11452, + "âĢĿ.": 11453, + "ertainment": 11454, + "cean": 11455, + "culate": 11456, + "Ġven": 11457, + "However": 11458, + "_arr": 11459, + "Ġtokens": 11460, + "Graph": 11461, + "ĠJud": 11462, + "ĠVirgin": 11463, + "ĠSerial": 11464, + "unning": 11465, + "Mutable": 11466, + "agers": 11467, + ".csv": 11468, + "Ġdeveloping": 11469, + "Ġinstructions": 11470, + "Ġpromise": 11471, + "Ġrequested": 11472, + "_encode": 11473, + "/\"": 11474, + "ĠIcon": 11475, + "uilt": 11476, + "-day": 11477, + "Ġintelligence": 11478, + ".IS": 11479, + "ĠObservable": 11480, + "ĠHard": 11481, + "Bool": 11482, + "211": 11483, + "idential": 11484, + ".Anchor": 11485, + "Ġselling": 11486, + "CI": 11487, + "AGES": 11488, + "tle": 11489, + "bur": 11490, + "UFFER": 11491, + "RY": 11492, + "Ġbigger": 11493, + "Ġrat": 11494, + "Ġfamous": 11495, + "Ġtypename": 11496, + "Ġexplained": 11497, + "}}Ċ": 11498, + "Ġnuclear": 11499, + "-N": 11500, + "Ġcrisis": 11501, + "ĠEnter": 11502, + "Ġanswers": 11503, + "/${": 11504, + "/pl": 11505, + "Ġsequ": 11506, + "_next": 11507, + "mask": 11508, + "Ġstanding": 11509, + "Ġplenty": 11510, + "ĠCross": 11511, + "ĉret": 11512, + "dro": 11513, + "ĠCast": 11514, + "167": 11515, + "=true": 11516, + "ĠChris": 11517, + "icio": 11518, + "ĠMike": 11519, + "Decimal": 11520, + "addComponent": 11521, + "Len": 11522, + "Ġcock": 11523, + "Ġ#{": 11524, + "URN": 11525, + "": 11657, + "Ġ*=": 11658, + "ĠPS": 11659, + "Ġdangerous": 11660, + "[p": 11661, + "OME": 11662, + "Other": 11663, + "ĠStringBuilder": 11664, + "Points": 11665, + "heading": 11666, + "Ġcurrency": 11667, + "Ġpercentage": 11668, + "_API": 11669, + "Ġclassic": 11670, + "thead": 11671, + "ĠMO": 11672, + "FE": 11673, + "Idx": 11674, + "await": 11675, + "Ġè": 11676, + "Ġaccident": 11677, + "Ġvariant": 11678, + "Ġmyst": 11679, + "ĠLand": 11680, + "ĠBre": 11681, + "Ġharm": 11682, + "ĠAcc": 11683, + "Ġcharged": 11684, + "iones": 11685, + "Visibility": 11686, + "arry": 11687, + "ĠLanguage": 11688, + "Ġwalking": 11689, + "\".ĊĊ": 11690, + "ifer": 11691, + "Ġleadership": 11692, + ".From": 11693, + "ynam": 11694, + "Ġtimestamp": 11695, + "ipt": 11696, + "ĠHas": 11697, + "REFER": 11698, + "ĠIts": 11699, + "Ġlistener": 11700, + "UTE": 11701, + "213": 11702, + "_description": 11703, + "Ġexperiences": 11704, + "Ġcreates": 11705, + "RS": 11706, + "cart": 11707, + "black": 11708, + "Ġchoices": 11709, + "war": 11710, + "750": 11711, + "Ġ'''": 11712, + "Ġordered": 11713, + "Ġevening": 11714, + "Ġpil": 11715, + "Ġtun": 11716, + "ĠBad": 11717, + "(app": 11718, + "random": 11719, + "Ġexplicit": 11720, + "Ġarrived": 11721, + "Ġfly": 11722, + "Ġeconom": 11723, + "-mail": 11724, + "Ġlists": 11725, + "Ġarchitect": 11726, + "234": 11727, + "ĠPay": 11728, + "Ġds": 11729, + "ĠSol": 11730, + "Ġvehicles": 11731, + "Hz": 11732, + "-com": 11733, + "Ġking": 11734, + "_equal": 11735, + "ĠHelp": 11736, + "Ġabuse": 11737, + "480": 11738, + "169": 11739, + "--;Ċ": 11740, + "Ġextr": 11741, + "Ġchemical": 11742, + "ä¿": 11743, + "Ġorient": 11744, + "Ġbreath": 11745, + "ĠSpace": 11746, + "(element": 11747, + "wait": 11748, + "DED": 11749, + "igma": 11750, + "Ġentr": 11751, + "Ġsob": 11752, + "-name": 11753, + "Ġaffected": 11754, + "ika": 11755, + "Ġcoal": 11756, + "_work": 11757, + "Ġhundreds": 11758, + "Ġpolitics": 11759, + "subject": 11760, + "Ġconsumer": 11761, + "ANGE": 11762, + "Ġrepeated": 11763, + "Send": 11764, + "Ġ#[": 11765, + "Ġprotocol": 11766, + "Ġleads": 11767, + "useum": 11768, + "Every": 11769, + "808": 11770, + "174": 11771, + "Import": 11772, + "(count": 11773, + "Ġchallenges": 11774, + "Ġnovel": 11775, + "Ġdepart": 11776, + "bits": 11777, + ".Current": 11778, + "Ġ`${": 11779, + "oting": 11780, + "(\\": 11781, + "Ġcreative": 11782, + "Ġbuff": 11783, + "Ġintroduced": 11784, + "usic": 11785, + "modules": 11786, + "Are": 11787, + "-doc": 11788, + "language": 11789, + "_cache": 11790, + "Ġtod": 11791, + "?>": 11792, + "omething": 11793, + "Ġhun": 11794, + "åº": 11795, + "aters": 11796, + "Intent": 11797, + "Ġimplemented": 11798, + "ĠCase": 11799, + "Children": 11800, + "Ġnotification": 11801, + "Renderer": 11802, + "Wrapper": 11803, + "Objects": 11804, + "tl": 11805, + ".Contains": 11806, + "Plugin": 11807, + ".row": 11808, + "Ġforg": 11809, + "Ġpermit": 11810, + "Ġtargets": 11811, + "ĠIF": 11812, + "Ġtip": 11813, + "sex": 11814, + "Ġsupports": 11815, + "Ġfold": 11816, + "photo": 11817, + "},čĊ": 11818, + "Ġgoogle": 11819, + "$('#": 11820, + "Ġsharing": 11821, + "Ġgoods": 11822, + "vs": 11823, + "ĠDan": 11824, + "Rate": 11825, + "ĠMartin": 11826, + "Ġmanner": 11827, + "lie": 11828, + ".The": 11829, + "Internal": 11830, + "ĠCONTR": 11831, + "Mock": 11832, + "RIGHT": 11833, + "Ġ'{": 11834, + "Ġcontrols": 11835, + "Mat": 11836, + "Ġmand": 11837, + "Ġextended": 11838, + "Ok": 11839, + "Ġembed": 11840, + "Ġplanet": 11841, + "ĠNon": 11842, + "-ch": 11843, + ")\",": 11844, + "epar": 11845, + "Ġbelieved": 11846, + "ĠEnvironment": 11847, + "ĠFriend": 11848, + "-res": 11849, + "Ġhandling": 11850, + "nic": 11851, + "-level": 11852, + "scri": 11853, + "Xml": 11854, + "BE": 11855, + "ungen": 11856, + "Ġalter": 11857, + "[idx": 11858, + "Pop": 11859, + "cam": 11860, + "Ġ(((": 11861, + "Ġshipping": 11862, + "Ġbattery": 11863, + "iddleware": 11864, + "MC": 11865, + "Ġimpl": 11866, + "otation": 11867, + "ĠLab": 11868, + "